diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..287b44282f88b500e77d201b9a3f3676688e5522 --- /dev/null +++ b/Makefile @@ -0,0 +1,33 @@ + +CC=gcc +FC=gfortran + +OPTS=-DTIMER -DCOUNTER -DCPU_TPS=2.67e9 +CFLAGS=-O3 -g -std=gnu99 -Wall -Werror -march=native -mtune=native -ffast-math -fomit-frame-pointer -malign-double -fstrict-aliasing -fopenmp +# CFLAGS=-O0 -g -std=gnu99 -Wall -Werror -fopenmp +LDFLAGS=-lm -lpthread -fopenmp + +FFLAGS=$(CFLAGS) + +OBJS=space.o runner.o test.o + +default: all + +depfile: $(wildcard *.c) $(wildcard *.h) + $(CC) -MM *.c *.h > depfile + +include depfile + +%.o: Makefile + +%.o: %.c Makefile + $(CC) -c $(CFLAGS) $(OPTS) $< -o $@ +# $(CC) -c $(CFLAGS) $(OPTS) $< -dA -S + +test: $(OBJS) + gcc $^ $(LDFLAGS) -o test + +all: test + +clean: + rm -f $(OBJS) test diff --git a/Opteron6174/test2.dump b/Opteron6174/test2.dump new file mode 100644 index 0000000000000000000000000000000000000000..9f4b5ca55bbc9f86aafdf2e4e94293a99c57a1c7 --- /dev/null +++ b/Opteron6174/test2.dump @@ -0,0 +1,48 @@ +4725.400 5722.778 8400.450 3194.839 120.059 105.812 56.976 5.655 0.000 0.000 19072.309 0.000 0.000 +4919.301 5755.781 8710.699 3294.436 145.669 130.172 68.079 5.951 0.989 1.456 9838.910 0.000 0.000 +4986.648 5794.996 8912.560 3360.085 198.966 178.689 108.065 6.243 5.700 7.087 6702.370 0.000 0.000 +5259.090 5782.343 9050.539 3391.693 206.106 186.435 105.906 6.455 5.320 5.107 5129.911 0.000 0.000 +5069.935 5801.673 9141.677 3427.429 254.263 228.474 140.110 6.954 12.026 13.356 4109.165 0.000 0.000 +5144.622 5772.367 9238.842 3460.168 240.814 217.191 127.767 6.964 10.301 10.270 3450.979 0.000 0.000 +5154.364 5800.711 9217.054 3457.825 310.817 271.857 166.828 7.499 31.194 33.498 2972.789 0.000 0.000 +5134.256 5778.530 9162.365 3433.527 274.115 242.285 143.293 7.479 19.334 19.719 2583.830 0.000 0.000 +5157.989 5803.972 9253.761 3458.642 345.791 303.071 185.356 8.256 38.970 41.899 2332.102 0.000 0.000 +5238.888 5784.603 9245.546 3459.337 304.001 263.466 158.093 8.064 26.586 26.403 2093.112 0.000 0.000 +5194.632 5806.809 9286.337 3476.360 386.899 326.131 197.683 8.884 60.838 64.860 1917.308 0.000 0.000 +5177.886 5780.655 9305.192 3478.527 351.232 291.278 170.044 8.479 55.968 57.658 1752.747 0.000 0.000 +5282.438 5814.123 9370.344 3487.279 439.419 354.893 209.715 9.268 90.429 96.766 1643.057 0.000 0.000 +5282.634 5785.133 9380.112 3494.921 368.999 309.481 184.850 8.954 50.828 50.418 1519.213 0.000 0.000 +5299.171 5814.226 9353.443 3493.089 501.669 374.055 221.973 9.862 132.895 138.850 1431.006 0.000 0.000 +5353.611 5783.394 9414.077 3502.090 404.104 325.553 197.225 9.566 67.871 67.871 1348.012 0.000 0.000 +5348.965 5817.267 9426.694 3506.508 490.604 375.330 226.972 10.119 110.522 113.377 1271.295 0.000 0.000 +5482.762 5794.389 9596.688 3515.471 456.408 346.792 208.519 9.985 99.973 99.417 1214.456 0.000 0.000 +5510.358 5821.199 9434.239 3515.476 541.956 392.489 238.131 10.826 137.361 139.270 1151.907 0.000 0.000 +5473.950 5784.403 9555.850 3524.260 521.190 362.134 215.772 10.350 151.132 151.251 1095.725 0.000 0.000 +5451.099 5823.864 9470.511 3526.494 604.650 407.376 247.352 11.149 187.478 191.021 1046.104 0.000 0.000 +5505.379 5797.034 9522.470 3532.581 525.123 382.731 230.351 10.745 135.603 132.723 998.445 0.000 0.000 +5477.417 5828.421 9518.820 3535.798 656.047 441.104 265.603 12.163 204.144 206.721 962.972 0.000 0.000 +5583.622 5806.082 9566.668 3544.756 557.810 391.739 239.332 11.165 152.257 148.435 923.460 0.000 0.000 +5515.739 5836.443 9570.943 3550.382 748.315 463.250 275.140 12.299 276.417 278.800 895.218 0.000 0.000 +5587.893 5817.037 9645.617 3558.896 595.736 409.485 250.071 11.453 173.158 168.782 859.240 0.000 0.000 +5542.670 5842.593 9748.742 3572.753 770.110 473.833 286.208 12.795 276.521 276.787 839.777 0.000 0.000 +5701.095 5822.677 9928.963 3604.407 651.649 436.940 264.515 12.132 205.074 198.781 816.121 0.000 0.000 +5657.524 5849.707 9725.956 3586.328 773.924 487.765 294.339 13.456 254.027 251.461 786.797 0.000 0.000 +5777.192 5822.372 9935.261 3617.644 743.599 456.303 276.207 12.457 279.953 272.829 768.997 0.000 0.000 +5660.033 5851.597 9713.186 3599.779 820.166 493.151 295.179 13.596 301.961 298.833 738.352 0.000 0.000 +5640.843 5807.036 9798.463 3591.253 629.146 351.585 180.046 14.269 265.955 256.574 709.472 0.000 0.000 +5714.651 5851.018 9832.902 3614.952 772.222 401.663 197.861 17.038 337.647 332.804 695.977 0.000 0.000 +5724.058 5807.343 9806.919 3602.480 656.785 355.842 183.545 14.471 288.564 277.148 668.951 0.000 0.000 +5681.948 5852.792 9813.468 3623.392 877.910 416.657 201.330 17.454 430.776 425.555 669.068 0.000 0.000 +6265.551 5826.791 9949.478 3645.344 1030.717 661.847 208.148 17.762 568.915 424.458 663.718 0.000 0.000 +5763.804 5862.045 10168.989 3683.041 966.853 442.515 222.032 19.469 485.971 479.159 650.065 0.000 0.000 +5932.583 5822.891 10004.099 3656.150 763.985 405.568 205.704 17.345 343.667 327.326 624.866 0.000 0.000 +5890.683 5882.241 10203.252 3737.363 979.443 468.428 233.404 21.593 474.652 465.229 613.284 0.000 0.000 +5907.127 5828.063 9983.026 3645.104 777.450 422.543 212.224 18.077 338.502 320.211 593.760 0.000 0.000 +5973.892 5878.569 10414.183 3776.429 988.204 502.585 245.691 23.177 452.688 437.958 593.734 0.000 0.000 +5999.062 5843.136 10431.820 3776.043 886.512 466.573 244.742 21.303 401.304 379.597 584.426 0.000 0.000 +5802.772 5879.462 10255.706 3782.129 1131.678 511.443 243.752 23.627 582.060 569.901 569.950 0.000 0.000 +6040.364 5838.864 10172.545 3704.922 927.791 486.075 238.959 21.438 423.612 400.392 545.770 0.000 0.000 +5974.247 5889.443 10785.004 3906.344 1201.819 582.030 286.344 28.453 581.319 563.692 565.256 0.000 0.000 +6185.360 5871.308 10675.076 3862.395 1044.241 524.659 278.511 25.544 497.958 468.343 551.707 0.000 0.000 +6098.932 5901.716 10787.633 3909.595 1242.428 582.900 286.725 28.737 615.565 592.976 536.952 0.000 0.000 +6237.389 5863.149 10836.057 3899.096 1144.514 553.592 292.793 27.093 571.810 539.514 527.113 0.000 0.000 diff --git a/cycle.h b/cycle.h new file mode 100644 index 0000000000000000000000000000000000000000..e357a017c79a6e9befb73e2b988bd23918b66f37 --- /dev/null +++ b/cycle.h @@ -0,0 +1,528 @@ +/* + * Copyright (c) 2003, 2007-8 Matteo Frigo + * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + + +/* machine-dependent cycle counters code. Needs to be inlined. */ + +/***************************************************************************/ +/* To use the cycle counters in your code, simply #include "cycle.h" (this + file), and then use the functions/macros: + + ticks getticks(void); + + ticks is an opaque typedef defined below, representing the current time. + You extract the elapsed time between two calls to gettick() via: + + double elapsed(ticks t1, ticks t0); + + which returns a double-precision variable in arbitrary units. You + are not expected to convert this into human units like seconds; it + is intended only for *comparisons* of time intervals. + + (In order to use some of the OS-dependent timer routines like + Solaris' gethrtime, you need to paste the autoconf snippet below + into your configure.ac file and #include "config.h" before cycle.h, + or define the relevant macros manually if you are not using autoconf.) +*/ + +/***************************************************************************/ +/* This file uses macros like HAVE_GETHRTIME that are assumed to be + defined according to whether the corresponding function/type/header + is available on your system. The necessary macros are most + conveniently defined if you are using GNU autoconf, via the tests: + + dnl --------------------------------------------------------------------- + + AC_C_INLINE + AC_HEADER_TIME + AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h]) + + AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H +#include <sys/time.h> +#endif]) + + AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time]) + + dnl Cray UNICOS _rtc() (real-time clock) intrinsic + AC_MSG_CHECKING([for _rtc intrinsic]) + rtc_ok=yes + AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H +#include <intrinsics.h> +#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no]) + AC_MSG_RESULT($rtc_ok) + + dnl --------------------------------------------------------------------- +*/ + +/***************************************************************************/ + +#if TIME_WITH_SYS_TIME +# include <sys/time.h> +# include <time.h> +#else +# if HAVE_SYS_TIME_H +# include <sys/time.h> +# else +# include <time.h> +# endif +#endif + +#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \ +{ \ + return (double)t1 - (double)t0; \ +} + +/*----------------------------------------------------------------*/ +/* Solaris */ +#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER) +typedef hrtime_t ticks; + +#define getticks gethrtime + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* AIX v. 4+ routines to read the real-time clock or time-base register */ +#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER) +typedef timebasestruct_t ticks; + +static __inline ticks getticks(void) +{ + ticks t; + read_real_time(&t, TIMEBASE_SZ); + return t; +} + +static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */ +{ + time_base_to_time(&t1, TIMEBASE_SZ); + time_base_to_time(&t0, TIMEBASE_SZ); + return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 + + ((double)t1.tb_low - (double)t0.tb_low)); +} + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * PowerPC ``cycle'' counter using the time base register. + */ +#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__)))) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + unsigned int tbl, tbu0, tbu1; + + do { + __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0)); + __asm__ __volatile__ ("mftb %0" : "=r"(tbl)); + __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1)); + } while (tbu0 != tbu1); + + return (((unsigned long long)tbu0) << 32) | tbl; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* MacOS/Mach (Darwin) time-base register interface (unlike UpTime, + from Carbon, requires no additional libraries to be linked). */ +#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER) +#include <mach/mach_time.h> +typedef uint64_t ticks; +#define getticks mach_absolute_time +INLINE_ELAPSED(__inline__) +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * Pentium cycle counter + */ +#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +#ifndef INLINE +# if __GNUC__ && !__GNUC_STDC_INLINE__ +# define INLINE extern inline +# else +# define INLINE inline +# endif +#endif +INLINE ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__("rdtsc": "=A" (ret)); + /* no input, nothing else clobbered */ + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ +#endif + +/* Visual C++ -- thanks to Morten Nissov for his help with this */ +#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER) +#include <windows.h> +typedef LARGE_INTEGER ticks; +#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */ + +static __inline ticks getticks(void) +{ + ticks retval; + + __asm { + RDTSC + mov retval.HighPart, edx + mov retval.LowPart, eax + } + return retval; +} + +static __inline double elapsed(ticks t1, ticks t0) +{ + return (double)t1.QuadPart - (double)t0.QuadPart; +} + +#define HAVE_TICK_COUNTER +#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ +#endif + +/*----------------------------------------------------------------*/ +/* + * X86-64 cycle counter + */ +#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +#ifndef INLINE +# if __GNUC__ && !__GNUC_STDC_INLINE__ +# define INLINE extern inline +# else +# define INLINE inline +# endif +#endif +INLINE ticks getticks(void) +{ + unsigned a, d; + asm volatile("rdtsc" : "=a" (a), "=d" (d)); + return ((ticks)a) | (((ticks)d) << 32); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori. + NOTE: this code will fail to link unless you use the -Masmkeyword compiler + option (grrr). */ +#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; +static ticks getticks(void) +{ + asm(" rdtsc; shl $0x20,%rdx; mov %eax,%eax; or %rdx,%rax; "); +} +INLINE_ELAPSED(__inline__) +#define HAVE_TICK_COUNTER +#endif + +/* Visual C++, courtesy of Dirk Michaelis */ +#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER) + +#include <intrin.h> +#pragma intrinsic(__rdtsc) +typedef unsigned __int64 ticks; +#define getticks __rdtsc +INLINE_ELAPSED(__inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * IA64 cycle counter + */ + +/* intel's icc/ecc compiler */ +#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; +#include <ia64intrin.h> + +static __inline__ ticks getticks(void) +{ + return __getReg(_IA64_REG_AR_ITC); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* gcc */ +#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret)); + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */ +#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER) +#include <machine/sys/inline.h> +typedef unsigned long ticks; + +static inline ticks getticks(void) +{ + ticks ret; + + ret = _Asm_mov_from_ar (_AREG_ITC); + return ret; +} + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/* Microsoft Visual C++ */ +#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER) +typedef unsigned __int64 ticks; + +# ifdef __cplusplus +extern "C" +# endif +ticks __getReg(int whichReg); +#pragma intrinsic(__getReg) + +static __inline ticks getticks(void) +{ + volatile ticks temp; + temp = __getReg(3116); + return temp; +} + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * PA-RISC cycle counter + */ +#if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +# ifdef __GNUC__ +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__("mfctl 16, %0": "=r" (ret)); + /* no input, nothing else clobbered */ + return ret; +} +# else +# include <machine/inline.h> +static inline unsigned long getticks(void) +{ + register ticks ret; + _MFCTL(16, ret); + return ret; +} +# endif + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* S390, courtesy of James Treacy */ +#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + ticks cycles; + __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc"); + return cycles; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif +/*----------------------------------------------------------------*/ +#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER) +/* + * The 32-bit cycle counter on alpha overflows pretty quickly, + * unfortunately. A 1GHz machine overflows in 4 seconds. + */ +typedef unsigned int ticks; + +static __inline__ ticks getticks(void) +{ + unsigned long cc; + __asm__ __volatile__ ("rpcc %0" : "=r"(cc)); + return (cc & 0xFFFFFFFF); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + __asm__ __volatile__("rd %%tick, %0" : "=r" (ret)); + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER) +# include <c_asm.h> +typedef unsigned int ticks; + +static __inline ticks getticks(void) +{ + unsigned long cc; + cc = asm("rpcc %v0"); + return (cc & 0xFFFFFFFF); +} + +INLINE_ELAPSED(__inline) + +#define HAVE_TICK_COUNTER +#endif +/*----------------------------------------------------------------*/ +/* SGI/Irix */ +#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER) +typedef struct timespec ticks; + +static inline ticks getticks(void) +{ + struct timespec t; + clock_gettime(CLOCK_SGI_CYCLE, &t); + return t; +} + +static inline double elapsed(ticks t1, ticks t0) +{ + return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 + + ((double)t1.tv_nsec - (double)t0.tv_nsec); +} +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* Cray UNICOS _rtc() intrinsic function */ +#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER) +#ifdef HAVE_INTRINSICS_H +# include <intrinsics.h> +#endif + +typedef long long ticks; + +#define getticks _rtc + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* MIPS ZBus */ +#if HAVE_MIPS_ZBUS_TIMER +#if defined(__mips__) && !defined(HAVE_TICK_COUNTER) +#include <sys/mman.h> +#include <unistd.h> +#include <fcntl.h> + +typedef uint64_t ticks; + +static inline ticks getticks(void) +{ + static uint64_t* addr = 0; + + if (addr == 0) + { + uint32_t rq_addr = 0x10030000; + int fd; + int pgsize; + + pgsize = getpagesize(); + fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0); + if (fd < 0) { + perror("open"); + return NULL; + } + addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr); + close(fd); + if (addr == (uint64_t *)-1) { + perror("mmap"); + return NULL; + } + } + + return *addr; +} + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif +#endif /* HAVE_MIPS_ZBUS_TIMER */ + diff --git a/data2/Density.txt.REMOVED.git-id b/data2/Density.txt.REMOVED.git-id new file mode 100644 index 0000000000000000000000000000000000000000..ca18fbb99d20e3aab423b998ca98bb4151619fc2 --- /dev/null +++ b/data2/Density.txt.REMOVED.git-id @@ -0,0 +1 @@ +3afe2c7e0c425a30cdd406cfb93f305b5a55dafd \ No newline at end of file diff --git a/data2/Mass.txt.REMOVED.git-id b/data2/Mass.txt.REMOVED.git-id new file mode 100644 index 0000000000000000000000000000000000000000..707be23bb0529c1b7d55045b37959c7ab6363369 --- /dev/null +++ b/data2/Mass.txt.REMOVED.git-id @@ -0,0 +1 @@ +98c93e9342eca73ce75144010612d4e0d47e3650 \ No newline at end of file diff --git a/data2/SmoothingLength.txt.REMOVED.git-id b/data2/SmoothingLength.txt.REMOVED.git-id new file mode 100644 index 0000000000000000000000000000000000000000..f944fe4f647ff8d8421a07aa2c175ab5a7edb26f --- /dev/null +++ b/data2/SmoothingLength.txt.REMOVED.git-id @@ -0,0 +1 @@ +d57fee6402e9e77800e477525643d943f172dcbc \ No newline at end of file diff --git a/lock.h b/lock.h new file mode 100644 index 0000000000000000000000000000000000000000..4357ec0c9aa259f304267cb11aa02aaa316d4276 --- /dev/null +++ b/lock.h @@ -0,0 +1,29 @@ + +/* Get the inlining right. */ +#ifndef INLINE +# if __GNUC__ && !__GNUC_STDC_INLINE__ +# define INLINE extern inline +# else +# define INLINE inline +# endif +#endif + +#ifdef PTHREAD_LOCK + #define lock_type pthread_spinlock_t + #define lock_init( l ) ( pthread_spin_init( l , PTHREAD_PROCESS_PRIVATE ) != 0 ) + #define lock_destroy( l ) ( pthread_spin_destroy( l ) != 0 ) + #define lock_lock( l ) ( pthread_spin_lock( l ) != 0 ) + #define lock_trylock( l ) ( pthread_spin_lock( l ) != 0 ) + #define lock_unlock( l ) ( pthread_spin_unlock( l ) != 0 ) +#else + #define lock_type volatile int + #define lock_init( l ) ( *l = 0 ) + #define lock_destroy( l ) 0 + INLINE int lock_lock ( volatile int *l ) { + while ( __sync_val_compare_and_swap( l , 0 , 1 ) != 0 ) + while( *l ); + return 0; + } + #define lock_trylock( l ) ( ( *(l) ) ? 1 : __sync_val_compare_and_swap( l , 0 , 1 ) ) + #define lock_unlock( l ) ( *l = 0 ) +#endif diff --git a/mkplots.m b/mkplots.m new file mode 100644 index 0000000000000000000000000000000000000000..2118fe183e0ed66f337554e0d02bac633f88b07b --- /dev/null +++ b/mkplots.m @@ -0,0 +1,100 @@ + +%% Scaling and efficiency plots for the first test +test = importdata( 'test_nosort.totals'); +ncores = size( test , 1 ); + +clf +subplot('position',[ 0.05 , 0.1 , 0.4 , 0.8 ]); +plot( 1:ncores , test(1,10) ./ test(:,10) , '-k' , 'LineWidth' , 2 ); hold on; +xlabel('nr. cores'); +plot( [1,ncores] , [1,ncores] , ':k' , 'LineWidth' , 1.4 ); +hold off; +title('Speedup'); +axis([ 1 , ncores , 0 , ncores ]); + +subplot('position',[ 0.52 0.1 , 0.4 , 0.8 ]); +plot( 1:ncores , test(1,10) ./ test(:,10) ./ (1:ncores)' , '-k' , 'LineWidth' , 2 ); hold on; +plot( [1,ncores] , [1,1] , ':k' , 'LineWidth' , 1.4 ); +text(4*ncores/5,1,sprintf('%.2f',min(test(:,10))),'BackgroundColor',[1,1,1],'FontSize',12); +xlabel('nr. cores'); +hold off; +title('Efficiency'); +axis([ 1 , ncores , 0 , 1.2 ]); + +% Print this plot +set( gcf , 'PaperSize' , 2.3*[ 8.5 4.5 ] ); +set( gcf , 'PaperPosition' , 2.3*[ 0.25 0.25 8 4 ] ); +print -depsc2 test.eps +!epstopdf test.eps + + +%% Scaling and efficiency plots for the second test +test2 = importdata( 'test2.totals'); +ncores = size( test2 , 1 ); + +clf +subplot('position',[ 0.05 , 0.1 , 0.4 , 0.8 ]); +plot( 1:ncores , test2(1,10) ./ test2(:,10) , '-k' , 'LineWidth' , 2 ); hold on; +xlabel('nr. cores'); +plot( [1,ncores] , [1,ncores] , ':k' , 'LineWidth' , 1.4 ); +hold off; +title('Speedup'); +axis([ 1 , ncores , 0 , ncores ]); + +subplot('position',[ 0.52 0.1 , 0.4 , 0.8 ]); +plot( 1:ncores , test2(1,10) ./ test2(:,10) ./ (1:ncores)' , '-k' , 'LineWidth' , 2 ); hold on; +plot( [1,ncores] , [1,1] , ':k' , 'LineWidth' , 1.4 ); +text(4*ncores/5,1,sprintf('%.2f',min(test2(:,10))),'BackgroundColor',[1,1,1],'FontSize',12); +xlabel('nr. cores'); +hold off; +title('Efficiency'); +axis([ 1 , ncores , 0 , 1.2 ]); + +% Print this plot +set( gcf , 'PaperSize' , 2.3*[ 8.5 4.5 ] ); +set( gcf , 'PaperPosition' , 2.3*[ 0.25 0.25 8 4 ] ); +print -depsc2 test2.eps +!epstopdf test2.eps + + +%% Components of the first test +test = importdata( 'test_nosort.totals'); +ncores = size( test , 1 ); +cols = [ 1 , 2 , 3 , 5 ]; + +clf +subplot('position',[ 0.1 , 0.1 , 0.8 , 0.8 ]); +plot( 1:ncores , test(:,cols) , 'LineWidth' , 2 ); hold on; +legend( 'sort' , 'self' , 'pair' , 'get task' , 'Location' , 'NorthWest' ); +xlabel('nr. cores'); +hold off; +title('ms per task type'); +axis([ 1 , ncores , 0 , max(max(test(:,cols)))*1.1 ]); + +% Print this plot +set( gcf , 'PaperSize' , 2.3*[ 4.5 4 ] ); +set( gcf , 'PaperPosition' , 2.3*[ 0.25 0.25 4 4 ] ); +print -depsc2 parts.eps +!epstopdf parts.eps + + +%% Components of the second test +test2 = importdata( 'test2.totals'); +ncores = size( test2 , 1 ); +cols = [ 1 , 2 , 3 , 5 ]; + +clf +subplot('position',[ 0.1 , 0.1 , 0.8 , 0.8 ]); +plot( 1:ncores , test2(:,cols) , 'LineWidth' , 2 ); hold on; +legend( 'sort' , 'self' , 'pair' , 'get task' , 'Location' , 'NorthWest' ); +xlabel('nr. cores'); +hold off; +title('ms per task type'); +axis([ 1 , ncores , 0 , max(max(test2(:,cols)))*1.1 ]); + +% Print this plot +set( gcf , 'PaperSize' , 2.3*[ 4.5 4 ] ); +set( gcf , 'PaperPosition' , 2.3*[ 0.25 0.25 4 4 ] ); +print -depsc2 parts2.eps +!epstopdf parts2.eps + diff --git a/runner.c b/runner.c new file mode 100644 index 0000000000000000000000000000000000000000..f6dea73e23069010a0bd6567189c7a1a1058e165 --- /dev/null +++ b/runner.c @@ -0,0 +1,1544 @@ + + +/* Some standard headers. */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> +#include <math.h> +#include <float.h> +#include <limits.h> +#include <omp.h> + +/* Local headers. */ +#include "cycle.h" +#include "lock.h" +#include "space.h" +#include "runner.h" + +/* Error macro. */ +#define error(s) { printf( "%s:%s:%i: %s\n" , __FILE__ , __FUNCTION__ , __LINE__ , s ); abort(); } + +/* Convert cell location to ID. */ +#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) ) + +/* The timers. */ +ticks runner_timer[ runner_timer_count ]; + +/* The counters. */ +int runner_counter[ runner_counter_count ]; + + + +const float runner_shift[13*3] = { + 5.773502691896258e-01 , 5.773502691896258e-01 , 5.773502691896258e-01 , + 7.071067811865475e-01 , 7.071067811865475e-01 , 0.0 , + 5.773502691896258e-01 , 5.773502691896258e-01 , -5.773502691896258e-01 , + 7.071067811865475e-01 , 0.0 , 7.071067811865475e-01 , + 1.0 , 0.0 , 0.0 , + 7.071067811865475e-01 , 0.0 , -7.071067811865475e-01 , + 5.773502691896258e-01 , -5.773502691896258e-01 , 5.773502691896258e-01 , + 7.071067811865475e-01 , -7.071067811865475e-01 , 0.0 , + 5.773502691896258e-01 , -5.773502691896258e-01 , -5.773502691896258e-01 , + 0.0 , 7.071067811865475e-01 , 7.071067811865475e-01 , + 0.0 , 1.0 , 0.0 , + 0.0 , 7.071067811865475e-01 , -7.071067811865475e-01 , + 0.0 , 0.0 , 1.0 , + }; +const char runner_flip[27] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , + 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; + + +/** + * @brief Sort the tasks IDs according to their weight and constraints. + * + * @param q The #queue. + */ + +void queue_sort ( struct queue *q ) { + + struct { + short int lo, hi; + } qstack[20]; + int qpos, i, j, k, lo, hi, imin, temp; + int pivot_weight, pivot_wait; + int *weight, *wait; + int *data = q->tid; + struct task *t; + + /* Allocate and pre-compute each task's weight. */ + if ( ( weight = (int *)alloca( sizeof(int) * q->count ) ) == NULL || + ( wait = (int *)alloca( sizeof(int) * q->count ) ) == NULL ) + error( "Failed to allocate weight buffer." ); + for ( k = 0 ; k < q->count ; k++ ) { + t = &q->tasks[ q->tid[k] ]; + switch ( t->type ) { + case tid_self: + wait[k] = t->rank; + weight[k] = 0; // t->ci->count * t->ci->count; + break; + case tid_pair: + wait[k] = t->rank; + weight[k] = 0; // t->ci->count * t->cj->count; + break; + case tid_sub: + wait[k] = t->rank; + weight[k] = 0; // (t->cj == NULL) ? t->ci->count * t->ci->count : t->ci->count * t->cj->count; + break; + case tid_sort: + wait[k] = t->rank; + weight[k] = 0; // t->ci->count; + break; + } + } + + /* Sort tasks. */ + qstack[0].lo = 0; qstack[0].hi = q->count - 1; qpos = 0; + while ( qpos >= 0 ) { + lo = qstack[qpos].lo; hi = qstack[qpos].hi; + qpos -= 1; + if ( hi - lo < 15 ) { + for ( i = lo ; i < hi ; i++ ) { + imin = i; + for ( j = i+1 ; j <= hi ; j++ ) + if ( ( wait[ j ] < wait[ imin ] ) || + ( wait[ j ] == wait[ imin ] && weight[ j ] > weight[ imin ] ) ) + if ( imin != i ) { + temp = data[imin]; data[imin] = data[i]; data[i] = temp; + temp = wait[imin]; wait[imin] = wait[i]; wait[i] = temp; + temp = weight[imin]; weight[imin] = weight[i]; weight[i] = temp; + } + } + } + else { + pivot_weight = weight[ ( lo + hi ) / 2 ]; + pivot_wait = wait[ ( lo + hi ) / 2 ]; + i = lo; j = hi; + while ( i <= j ) { + while ( ( wait[ i ] < pivot_wait ) || + ( wait[ i ] == pivot_wait && weight[ i ] > pivot_weight ) ) + i++; + while ( ( wait[ j ] > pivot_wait ) || + ( wait[ j ] == pivot_wait && weight[ j ] < pivot_weight ) ) + j--; + if ( i <= j ) { + if ( i < j ) { + temp = data[i]; data[i] = data[j]; data[j] = temp; + temp = wait[i]; wait[i] = wait[j]; wait[j] = temp; + temp = weight[i]; weight[i] = weight[j]; weight[j] = temp; + } + i += 1; j -= 1; + } + } + if ( j > ( lo + hi ) / 2 ) { + if ( lo < j ) { + qpos += 1; + qstack[qpos].lo = lo; + qstack[qpos].hi = j; + } + if ( i < hi ) { + qpos += 1; + qstack[qpos].lo = i; + qstack[qpos].hi = hi; + } + } + else { + if ( i < hi ) { + qpos += 1; + qstack[qpos].lo = i; + qstack[qpos].hi = hi; + } + if ( lo < j ) { + qpos += 1; + qstack[qpos].lo = lo; + qstack[qpos].hi = j; + } + } + } + } + + } + + +/** + * @brief Sort the tasks in topological order over all queues. + * + * @param r The #runner. + */ + +void runner_ranktasks ( struct runner *r ) { + + int i, j = 0, k, temp, left = 0, rank; + struct task *t; + struct space *s = r->s; + int *tid; + + /* Run throught the tasks and get all the waits right. */ + for ( k = 0 ; k < s->nr_tasks ; k++ ) { + for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ ) + s->tasks[k].unlock_tasks[j]->wait += 1; + } + + /* Allocate and init the task-ID array. */ + if ( ( tid = (int *)malloc( sizeof(int) * s->nr_tasks ) ) == NULL ) + error( "Failed to allocate temporary tid array." ); + for ( k = 0 ; k < s->nr_tasks ; k++ ) + tid[k] = k; + + /* Main loop. */ + for ( rank = 0 ; left < s->nr_tasks ; rank++ ) { + + /* Load the tids of tasks with no waits. */ + for ( k = left ; k < s->nr_tasks ; k++ ) + if ( s->tasks[ tid[k] ].wait == 0 ) { + temp = tid[j]; tid[j] = tid[k]; tid[k] = temp; + j += 1; + } + + /* Traverse the task tree and add tasks with no weight. */ + for ( i = left ; i < j ; i++ ) { + t = &s->tasks[ tid[i] ]; + t->rank = rank; + s->tasks_ind[i] = t - s->tasks; + /* printf( "runner_ranktasks: task %i of type %s has rank %i.\n" , i , + (t->type == tid_self) ? "self" : (t->type == tid_pair) ? "pair" : "sort" , rank ); */ + for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) + t->unlock_tasks[k]->wait -= 1; + } + + /* The new left (no, not tony). */ + left = j; + + } + + /* Release the temporary array. */ + free(tid); + + } + + +/** + * @brief Compute the interactions between a cell pair. + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + */ + +void runner_dopair_naive ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) { + + struct runner *r = rt->r; + int pid, pjd, k, count_i = ci->count, count_j = cj->count; + double shift[3] = { 0.0 , 0.0 , 0.0 }; + struct part *pi, *pj, *parts_i = ci->parts, *parts_j = cj->parts; + double dx[3], pix[3], ri, ri2, r2; + TIMER_TIC + + /* Get the relative distance between the pairs, wrapping. */ + for ( k = 0 ; k < 3 ; k++ ) { + if ( cj->loc[k] - ci->loc[k] < -r->s->dim[k]/2 ) + shift[k] = r->s->dim[k]; + else if ( cj->loc[k] - ci->loc[k] > r->s->dim[k]/2 ) + shift[k] = -r->s->dim[k]; + } + + /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" , + ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] , + ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); + tic = getticks(); */ + + /* Loop over the parts in ci. */ + for ( pid = 0 ; pid < count_i ; pid++ ) { + + /* Get a hold of the ith part in ci. */ + pi = &parts_i[ pid ]; + for ( k = 0 ; k < 3 ; k++ ) + pix[k] = pi->x[k] - shift[k]; + ri = pi->r; + ri2 = ri * ri; + + /* Loop over the parts in cj. */ + for ( pjd = 0 ; pjd < count_j ; pjd++ ) { + + /* Get a pointer to the jth particle. */ + pj = &parts_j[ pjd ]; + + /* Compute the pairwise distance. */ + r2 = 0.0; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k]*dx[k]; + } + + /* Hit or miss? */ + if ( r2 < ri2 || r2 < pj->r*pj->r ) { + + iact( r2 , ri , pj->r , &pi->count , &pj->count , &pi->icount , &pj->icount ); + + } + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ + + #ifdef TIMER_VERBOSE + printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , ((double)TIMER_TOC(runner_timer_dopair)) / CPU_TPS * 1000 ); + #else + TIMER_TOC(runner_timer_dopair); + #endif + + + } + + +/** + * @brief Compute the interactions between a cell pair. + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + */ + +void runner_dopair ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) { + + struct runner *r = rt->r; + int pid, pjd, k, sid; + double rshift, shift[3] = { 0.0 , 0.0 , 0.0 }; + struct cell *temp; + struct entry *sort_i, *sort_j; + struct part *pi, *pj, *parts_i, *parts_j; + double dx[3], pix[3], pjx[3], ri, ri2, rj, rj2, r2, di, dj; + double ri_max, rj_max, di_max, dj_min; + int count_i, count_j; + TIMER_TIC + + /* Get the relative distance between the pairs, wrapping. */ + for ( k = 0 ; k < 3 ; k++ ) { + if ( cj->loc[k] - ci->loc[k] < -r->s->dim[k]/2 ) + shift[k] = r->s->dim[k]; + else if ( cj->loc[k] - ci->loc[k] > r->s->dim[k]/2 ) + shift[k] = -r->s->dim[k]; + } + + /* Get the sorting index. */ + for ( sid = 0 , k = 0 ; k < 3 ; k++ ) + sid = 3*sid + ( (cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1 ); + + /* Switch the cells around? */ + if ( runner_flip[sid] ) { + temp = ci; ci = cj; cj = temp; + for ( k = 0 ; k < 3 ; k++ ) + shift[k] = -shift[k]; + } + sid = sortlistID[sid]; + + /* Get the cutoff shift. */ + for ( rshift = 0.0 , k = 0 ; k < 3 ; k++ ) + rshift += shift[k]*runner_shift[ 3*sid + k ]; + + /* printf( "runner_dopair: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" , + ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] , + ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); */ + /* for ( ri = 0 , k = 0 ; k < ci->count ; k++ ) + ri += ci->parts[k].r; + for ( rj = 0 , k = 0 ; k < cj->count ; k++ ) + rj += cj->parts[k].r; + printf( "runner_dopair: avg. radii %g/%g for h=%g at depth=%i.\n" , ri/ci->count , rj/cj->count , ci->h[0] , ci->depth ); fflush(stdout); */ + + /* Pick-out the sorted lists. */ + sort_i = &ci->sort[ sid*(ci->count + 1) ]; + sort_j = &cj->sort[ sid*(cj->count + 1) ]; + + /* Get some other useful values. */ + ri_max = ci->r_max - rshift; rj_max = cj->r_max - rshift; + count_i = ci->count; count_j = cj->count; + parts_i = ci->parts; parts_j = cj->parts; + di_max = sort_i[count_i-1].d; + dj_min = sort_j[0].d; + + /* if ( ci->split && cj->split && sid == 4 ) + printf( "boing!\n" ); */ + + /* Loop over the parts in ci. */ + for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d + ri_max > dj_min ; pid-- ) { + + /* Get a hold of the ith part in ci. */ + pi = &parts_i[ sort_i[ pid ].i ]; + ri = pi->r - rshift; + di = sort_i[pid].d + ri; + if ( di < dj_min ) + continue; + + ri2 = pi->r * pi->r; + for ( k = 0 ; k < 3 ; k++ ) + pix[k] = pi->x[k] - shift[k]; + + /* Loop over the parts in cj. */ + for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d < di ; pjd++ ) { + + /* Get a pointer to the jth particle. */ + pj = &parts_j[ sort_j[pjd].i ]; + + /* Compute the pairwise distance. */ + r2 = 0.0; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k]*dx[k]; + } + + /* Hit or miss? */ + if ( r2 < ri2 ) { + + iact( r2 , ri , pj->r , &pi->count , &pj->count , &pi->icount , &pj->icount ); + + } + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ + + /* printf( "runner_dopair: first half took %.3f ms...\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); + tic = getticks(); */ + + /* Loop over the parts in cj. */ + for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d - rj_max < di_max ; pjd++ ) { + + /* Get a hold of the jth part in cj. */ + pj = &parts_j[ sort_j[ pjd ].i ]; + rj = pj->r + rshift; + dj = sort_j[pjd].d - rj; + if ( dj > di_max ) + continue; + + for ( k = 0 ; k < 3 ; k++ ) + pjx[k] = pj->x[k] + shift[k]; + rj2 = pj->r * pj->r; + + /* Loop over the parts in ci. */ + for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d > dj ; pid-- ) { + + /* Get a pointer to the jth particle. */ + pi = &parts_i[ sort_i[pid].i ]; + + /* Compute the pairwise distance. */ + r2 = 0.0; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = pi->x[k] - pjx[k]; + r2 += dx[k]*dx[k]; + } + + /* Hit or miss? */ + if ( r2 < rj2 && r2 > pi->r*pi->r ) { + + iact( r2 , 0 , rj , NULL , &pj->count , NULL , &pj->icount ); + + } + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ + + #ifdef TIMER_VERBOSE + printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(runner_timer_dopair))) / CPU_TPS * 1000 ); + #else + TIMER_TOC(runner_timer_dopair); + #endif + + } + + +/** + * @brief Compute the cell self-interaction. + * + * @param r The #runner. + * @param c The #cell. + */ + +void runner_doself ( struct runner_thread *rt , struct cell *c ) { + + int k, pid, pjd, count = c->count; + double pix[3], dx[3], ri, ri2, r2; + struct part *pi, *pj, *parts = c->parts; + TIMER_TIC + + if ( c->split ) + error( "Split cell should not have self-interactions." ); + + /* Loop over the particles in the cell. */ + for ( pid = 0 ; pid < count ; pid++ ) { + + /* Get a pointer to the ith particle. */ + pi = &parts[pid]; + + /* Get the particle position and radius. */ + for ( k = 0 ; k < 3 ; k++ ) + pix[k] = pi->x[k]; + ri = pi->r; + ri2 = ri * ri; + + /* Loop over the other particles .*/ + for ( pjd = pid+1 ; pjd < count ; pjd++ ) { + + /* Get a pointer to the jth particle. */ + pj = &parts[pjd]; + + /* Compute the pairwise distance. */ + r2 = 0.0; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k]*dx[k]; + } + + /* Hit or miss? */ + if ( r2 < ri2 || r2 < pj->r*pj->r ) { + + iact( r2 , ri , pj->r , &pi->count , &pj->count , &pi->icount , &pj->icount ); + + } + + } /* loop over all other particles. */ + + } /* loop over all particles. */ + + #ifdef TIMER_VERBOSE + printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , rt->id , count , c->depth , ((double)TIMER_TOC(runner_timer_doself)) / CPU_TPS * 1000 ); + #else + TIMER_TOC(runner_timer_doself); + #endif + + } + + +/** + * @brief Compute grouped sub-cell interactions + * + * @param r The #runner. + * @param c The #cell. + */ + +void runner_dosub ( struct runner_thread *rt , struct cell *ci , struct cell *cj , int flags ) { + + int j, k; + + TIMER_TIC + + /* Different types of flags. */ + switch ( flags ) { + + /* Regular sub-cell interactions of a single cell. */ + case 0: + for ( j = 0 ; j < 7 ; j++ ) + for ( k = j + 1 ; k < 8 ; k++ ) + runner_dopair( rt , ci->progeny[j] , ci->progeny[k] ); + break; + + case 1: /* ( 1 , 1 , 0 ) */ + runner_dopair( rt , ci->progeny[6] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[6] , cj->progeny[1] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[1] ); + break; + + case 3: /* ( 1 , 0 , 1 ) */ + runner_dopair( rt , ci->progeny[5] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[5] , cj->progeny[2] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[2] ); + break; + + case 4: /* ( 1 , 0 , 0 ) */ + runner_dopair( rt , ci->progeny[4] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[4] , cj->progeny[1] ); + runner_dopair( rt , ci->progeny[4] , cj->progeny[2] ); + runner_dopair( rt , ci->progeny[4] , cj->progeny[3] ); + runner_dopair( rt , ci->progeny[5] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[5] , cj->progeny[1] ); + runner_dopair( rt , ci->progeny[5] , cj->progeny[2] ); + runner_dopair( rt , ci->progeny[5] , cj->progeny[3] ); + runner_dopair( rt , ci->progeny[6] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[6] , cj->progeny[1] ); + runner_dopair( rt , ci->progeny[6] , cj->progeny[2] ); + runner_dopair( rt , ci->progeny[6] , cj->progeny[3] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[1] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[2] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[3] ); + break; + + case 5: /* ( 1 , 0 , -1 ) */ + runner_dopair( rt , ci->progeny[4] , cj->progeny[1] ); + runner_dopair( rt , ci->progeny[4] , cj->progeny[3] ); + runner_dopair( rt , ci->progeny[6] , cj->progeny[1] ); + runner_dopair( rt , ci->progeny[6] , cj->progeny[3] ); + break; + + case 7: /* ( 1 , -1 , 0 ) */ + runner_dopair( rt , ci->progeny[4] , cj->progeny[2] ); + runner_dopair( rt , ci->progeny[4] , cj->progeny[3] ); + runner_dopair( rt , ci->progeny[5] , cj->progeny[2] ); + runner_dopair( rt , ci->progeny[5] , cj->progeny[3] ); + break; + + case 9: /* ( 0 , 1 , 1 ) */ + runner_dopair( rt , ci->progeny[3] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[3] , cj->progeny[4] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[4] ); + break; + + case 10: /* ( 0 , 1 , 0 ) */ + runner_dopair( rt , ci->progeny[2] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[2] , cj->progeny[1] ); + runner_dopair( rt , ci->progeny[2] , cj->progeny[4] ); + runner_dopair( rt , ci->progeny[2] , cj->progeny[5] ); + runner_dopair( rt , ci->progeny[3] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[3] , cj->progeny[1] ); + runner_dopair( rt , ci->progeny[3] , cj->progeny[4] ); + runner_dopair( rt , ci->progeny[3] , cj->progeny[5] ); + runner_dopair( rt , ci->progeny[6] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[6] , cj->progeny[1] ); + runner_dopair( rt , ci->progeny[6] , cj->progeny[4] ); + runner_dopair( rt , ci->progeny[6] , cj->progeny[5] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[1] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[4] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[5] ); + break; + + case 11: /* ( 0 , 1 , -1 ) */ + runner_dopair( rt , ci->progeny[2] , cj->progeny[1] ); + runner_dopair( rt , ci->progeny[2] , cj->progeny[5] ); + runner_dopair( rt , ci->progeny[6] , cj->progeny[1] ); + runner_dopair( rt , ci->progeny[6] , cj->progeny[5] ); + break; + + case 12: /* ( 0 , 0 , 1 ) */ + runner_dopair( rt , ci->progeny[1] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[1] , cj->progeny[2] ); + runner_dopair( rt , ci->progeny[1] , cj->progeny[4] ); + runner_dopair( rt , ci->progeny[1] , cj->progeny[6] ); + runner_dopair( rt , ci->progeny[3] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[3] , cj->progeny[2] ); + runner_dopair( rt , ci->progeny[3] , cj->progeny[4] ); + runner_dopair( rt , ci->progeny[3] , cj->progeny[6] ); + runner_dopair( rt , ci->progeny[5] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[5] , cj->progeny[2] ); + runner_dopair( rt , ci->progeny[5] , cj->progeny[4] ); + runner_dopair( rt , ci->progeny[5] , cj->progeny[6] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[0] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[2] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[4] ); + runner_dopair( rt , ci->progeny[7] , cj->progeny[6] ); + break; + + } + + + #ifdef TIMER_VERBOSE + printf( "runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n" , rt->id , flags , ci->depth , ((double)TIMER_TOC(runner_timer_dosub)) / CPU_TPS * 1000 ); + #else + TIMER_TOC(runner_timer_dosub); + #endif + + } + + +/** + * @brief Sort the entries in ascending order using QuickSort. + * + * @param sort The entries + * @param N The number of entries. + */ + +void runner_dosort_ascending ( struct entry *sort , int N ) { + + struct { + short int lo, hi; + } qstack[10]; + int qpos, i, j, lo, hi, imin; + struct entry temp; + float pivot; + + /* Sort parts in cell_i in decreasing order with quicksort */ + qstack[0].lo = 0; qstack[0].hi = N - 1; qpos = 0; + while ( qpos >= 0 ) { + lo = qstack[qpos].lo; hi = qstack[qpos].hi; + qpos -= 1; + if ( hi - lo < 15 ) { + for ( i = lo ; i < hi ; i++ ) { + imin = i; + for ( j = i+1 ; j <= hi ; j++ ) + if ( sort[j].d < sort[imin].d ) + imin = j; + if ( imin != i ) { + temp = sort[imin]; sort[imin] = sort[i]; sort[i] = temp; + } + } + } + else { + pivot = sort[ ( lo + hi ) / 2 ].d; + i = lo; j = hi; + while ( i <= j ) { + while ( sort[i].d < pivot ) i++; + while ( sort[j].d > pivot ) j--; + if ( i <= j ) { + if ( i < j ) { + temp = sort[i]; sort[i] = sort[j]; sort[j] = temp; + } + i += 1; j -= 1; + } + } + if ( j > ( lo + hi ) / 2 ) { + if ( lo < j ) { + qpos += 1; + qstack[qpos].lo = lo; + qstack[qpos].hi = j; + } + if ( i < hi ) { + qpos += 1; + qstack[qpos].lo = i; + qstack[qpos].hi = hi; + } + } + else { + if ( i < hi ) { + qpos += 1; + qstack[qpos].lo = i; + qstack[qpos].hi = hi; + } + if ( lo < j ) { + qpos += 1; + qstack[qpos].lo = lo; + qstack[qpos].hi = j; + } + } + } + } + + } + + +/** + * @brief inline helper fuction to merge two entry arrays (forward). + * + * @param one the first array + * @param none the length of the first array + * @param two the second array + * @param ntwo the length of the second array + * @param dest the destination array. + */ + +inline void merge_forward ( struct entry *__restrict__ one , int none , struct entry *__restrict__ two , int ntwo , struct entry *__restrict__ dest ) { + + int i = 0, j = 0, k = 0; + + while ( j < none && k < ntwo ) + if ( one[j].d < two[k].d ) + dest[i++] = one[j++]; + else + dest[i++] = two[k++]; + if ( j == none ) + for ( ; k < ntwo ; k++ ) + dest[i++] = two[k]; + else + for ( ; j < none ; j++ ) + dest[i++] = one[j]; + + } + + +/** + * @brief inline helper fuction to merge two entry arrays (forward). + * + * @param one the first array + * @param none the length of the first array + * @param two the second array + * @param ntwo the length of the second array + * @param dest the destination array. + */ + +inline void merge_backward ( struct entry *__restrict__ one , int none , struct entry *__restrict__ two , int ntwo , struct entry *__restrict__ dest ) { + + int i = none + ntwo - 1, j = none - 1, k = ntwo - 1; + + while ( j >= 0 && k >= 0 ) + if ( one[j].d > two[k].d ) + dest[i--] = one[j--]; + else + dest[i--] = two[k--]; + if ( j < 0 ) + for ( ; k >= 0 ; k-- ) + dest[i--] = two[k]; + else + for ( ; j >= 0 ; j-- ) + dest[i--] = one[j]; + + } + + +/** + * @brief Sort the particles in the given cell along all cardinal directions. + * + * @param r The #runner. + * @param c The #cell. + */ + +void runner_dosort ( struct runner_thread *rt , struct cell *c , int flags ) { + + struct entry *finger; + struct entry *fingers[8]; + struct part *parts = c->parts; + int j, k, count = c->count; + int cone, ctwo; + int i, ind, off[8], inds[8], temp_i; + float shift[3]; + float buff[8]; + struct cell *temp_c; + TIMER_TIC + + /* Does this cell even need to be sorted? */ + for ( temp_c = c ; temp_c != NULL && temp_c->nr_pairs == 0 ; temp_c = temp_c->parent ); + if ( temp_c == NULL ) + return; + + /* start by allocating the entry arrays. */ + if ( lock_lock( &c->lock ) != 0 ) + error( "Failed to lock cell." ); + if ( c->sort == NULL ) + if ( ( c->sort = (struct entry *)malloc( sizeof(struct entry) * (c->count + 1) * 13 ) ) == NULL ) + error( "Failed to allocate sort memory." ); + if ( lock_unlock( &c->lock ) != 0 ) + error( "Failed to unlock cell." ); + + /* Does this cell have any progeny? */ + if ( c->split ) { + + /* Loop over the 13 different sort arrays. */ + for ( j = 0 ; j < 13 ; j++ ) { + + /* Has this sort array been flagged? */ + if ( !( flags & (1 << j) ) ) + continue; + + if ( 0 ) { + + /* Get a finger on the sorting array. */ + finger = &c->sort[ j*(count + 1) ]; + + /* Merge the two first sub-cells forward into this cell. */ + cone = c->progeny[0]->count; + ctwo = c->progeny[1]->count; + merge_forward( &c->progeny[0]->sort[ j*(cone + 1) ] , cone , + &c->progeny[1]->sort[ j*(ctwo + 1) ] , ctwo , + finger ); + + /* Merge-in the remaining arrays, alternating forward and + backward merges. */ + for ( k = 2 ; k < 8 ; k++ ) { + cone = cone + ctwo; + ctwo = c->progeny[k]->count; + if ( k & 1 ) + merge_forward( &finger[ count - cone ] , cone , + &c->progeny[k]->sort[ j*(ctwo + 1) ] , ctwo , + finger ); + else + merge_backward( finger , cone , + &c->progeny[k]->sort[ j*(ctwo + 1) ] , ctwo , + &finger[ count - cone - ctwo ] ); + } + + } + + else { + + /* Init the particle index offsets. */ + for ( off[0] = 0 , k = 1 ; k < 8 ; k++ ) + if ( c->progeny[k-1] != NULL ) + off[k] = off[k-1] + c->progeny[k-1]->count; + else + off[k] = off[k-1]; + + /* Init the entries and indices. */ + for ( k = 0 ; k < 8 ; k++ ) { + inds[k] = k; + if ( c->progeny[k] != NULL && c->progeny[k]->count > 0 ) { + fingers[k] = &c->progeny[k]->sort[ j*(c->progeny[k]->count + 1) ]; + buff[k] = fingers[k]->d; + off[k] = off[k]; + } + else + buff[k] = FLT_MAX; + } + + /* Sort the buffer. */ + for ( i = 0 ; i < 7 ; i++ ) + for ( k = i+1 ; k < 8 ; k++ ) + if ( buff[ inds[k] ] < buff[ inds[i] ] ) { + temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i; + } + + /* For each entry in the new sort list. */ + finger = &c->sort[ j*(count + 1) ]; + for ( ind = 0 ; ind < count ; ind++ ) { + + /* Copy the minimum into the new sort array. */ + finger[ind].d = buff[inds[0]]; + finger[ind].i = fingers[inds[0]]->i + off[inds[0]]; + + /* Update the buffer. */ + fingers[inds[0]] += 1; + buff[inds[0]] = fingers[inds[0]]->d; + + /* Find the smallest entry. */ + for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) { + temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i; + } + + } /* Merge. */ + + } + + /* Add a sentinel. */ + c->sort[ j*(c->count + 1) + c->count ].d = FLT_MAX; + c->sort[ j*(c->count + 1) + c->count ].i = 0; + + } /* loop over sort arrays. */ + + } /* progeny? */ + + /* Otherwise, just sort. */ + else { + + /* Loop over the different cell axes. */ + for ( j = 0 ; j < 13 ; j++ ) { + + /* Has this sort array been flagged? */ + if ( !( flags & (1 << j) ) ) + continue; + + /* Get the shift vector. */ + shift[0] = runner_shift[ 3*j + 0 ]; + shift[1] = runner_shift[ 3*j + 1 ]; + shift[2] = runner_shift[ 3*j + 2 ]; + + /* Fill the sort array. */ + finger = &c->sort[ j*(count + 1) ]; + for ( k = 0 ; k < count ; k++ ) { + finger[k].i = k; + finger[k].d = parts[k].x[0]*shift[0] + parts[k].x[1]*shift[1] + parts[k].x[2]*shift[2]; + } + + /* Add the sentinel. */ + finger[ c->count ].d = FLT_MAX; + finger[ c->count ].i = 0; + + /* Sort descending. */ + runner_dosort_ascending( finger , c->count ); + + } + + } + + /* Verify the sorting. */ + /* for ( j = 0 ; j < 13 ; j++ ) { + if ( !( flags & (1 << j) ) ) + continue; + finger = &c->sort[ j*(c->count + 1) ]; + for ( k = 1 ; k < c->count ; k++ ) { + if ( finger[k].d < finger[k-1].d ) + error( "Sorting failed, ascending array." ); + if ( finger[k].i >= c->count ) + error( "Sorting failed, indices borked." ); + } + } */ + + #ifdef TIMER_VERBOSE + printf( "runner_dosort[%02i]: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms.\n" , + rt->id , c->count , c->depth , + (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , + ((double)TIMER_TOC(runner_timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout); + #else + TIMER_TOC(runner_timer_dosort); + #endif + + } + + +/** + * @brief Lock a cell and hold its parents. + * + * @param c The #cell. + */ + +int cell_locktree( struct cell *c ) { + + struct cell *finger, *finger2; + TIMER_TIC + + /* First of all, try to lock this cell. */ + if ( lock_trylock( &c->lock ) != 0 ) { + TIMER_TOC(runner_timer_tree); + return 1; + } + + /* Did somebody hold this cell in the meantime? */ + if ( c->hold ) { + + /* Unlock this cell. */ + if ( lock_unlock( &c->lock ) != 0 ) + error( "Failed to unlock cell." ); + + /* Admit defeat. */ + TIMER_TOC(runner_timer_tree); + return 1; + + } + + /* Climb up the tree and lock/hold/unlock. */ + for ( finger = c->parent ; finger != NULL ; finger = finger->parent ) { + + /* Lock this cell. */ + if ( lock_trylock( &finger->lock ) != 0 ) + break; + + /* Increment the hold. */ + __sync_fetch_and_add( &finger->hold , 1 ); + + /* Unlock the cell. */ + if ( lock_unlock( &finger->lock ) != 0 ) + error( "Failed to unlock cell." ); + + } + + /* If we reached the top of the tree, we're done. */ + if ( finger == NULL ) { + TIMER_TOC(runner_timer_tree); + return 0; + } + + /* Otherwise, we hit a snag. */ + else { + + /* Undo the holds up to finger. */ + for ( finger2 = c->parent ; finger2 != finger ; finger2 = finger2->parent ) + __sync_fetch_and_sub( &finger2->hold , 1 ); + + /* Unlock this cell. */ + if ( lock_unlock( &c->lock ) != 0 ) + error( "Failed to unlock cell." ); + + /* Admit defeat. */ + TIMER_TOC(runner_timer_tree); + return 1; + + } + + } + + +/** + * @brief Unock a cell's parents. + * + * @param c The #cell. + */ + +void cell_unlocktree( struct cell *c ) { + + struct cell *finger; + TIMER_TIC + + /* First of all, try to unlock this cell. */ + if ( lock_unlock( &c->lock ) != 0 ) + error( "Failed to unlock cell." ); + + /* Climb up the tree and unhold the parents. */ + for ( finger = c->parent ; finger != NULL ; finger = finger->parent ) + __sync_fetch_and_sub( &finger->hold , 1 ); + + TIMER_TOC(runner_timer_tree); + + } + + +/** + * @brief Implements a barrier for the #runner threads. + * + */ + +void runner_barrier( struct runner *r ) { + + /* First, get the barrier mutex. */ + if ( pthread_mutex_lock( &r->barrier_mutex ) != 0 ) + error( "Failed to get barrier mutex." ); + + /* Wait for the barrier to close. */ + while ( r->barrier_count < 0 ) + if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 ) + error( "Eror waiting for barrier to close." ); + + /* Once I'm in, increase the barrier count. */ + r->barrier_count += 1; + + /* If all threads are in, send a signal... */ + if ( r->barrier_count == r->nr_threads ) + if ( pthread_cond_broadcast( &r->barrier_cond ) != 0 ) + error( "Failed to broadcast barrier full condition." ); + + /* Wait for barrier to be released. */ + while ( r->barrier_count > 0 ) + if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 ) + error( "Error waiting for barrier to be released." ); + + /* Decrease the counter before leaving... */ + r->barrier_count += 1; + + /* If I'm the last one out, signal the condition again. */ + if ( r->barrier_count == 0 ) + if ( pthread_cond_broadcast( &r->barrier_cond ) != 0 ) + error( "Failed to broadcast empty barrier condition." ); + + /* Last but not least, release the mutex. */ + if ( pthread_mutex_unlock( &r->barrier_mutex ) != 0 ) + error( "Failed to get unlock the barrier mutex." ); + + } + + +/** + * @brief The #runner main thread routine. + * + * @param data A pointer to this thread's data. + */ + +void *runner_main ( void *data ) { + + struct runner_thread *rt = (struct runner_thread *)data; + struct runner *r = rt->r; + struct space *s = r->s; + int threadID = rt->id; + int k, qid, naq, keep, tpq; + struct queue *queues[ r->nr_queues ], *myq; + struct task *t; + struct cell *ci, *cj; + #ifdef TIMER + ticks stalled; + #endif + + /* Main loop. */ + while ( 1 ) { + + /* Wait at the barrier. */ + runner_barrier( r ); + + /* Set some convenient local data. */ + keep = r->policy & runner_policy_keep; + myq = &r->queues[ threadID % r->nr_queues ]; + tpq = ceil( ((double)r->nr_threads) / r->nr_queues ); + stalled = 0; + + /* Set up the local list of active queues. */ + naq = r->nr_queues; + for ( k = 0 ; k < naq ; k++ ) + queues[k] = &r->queues[k]; + + /* Set up the local list of active queues. */ + naq = r->nr_queues; + for ( k = 0 ; k < naq ; k++ ) + queues[k] = &r->queues[k]; + + /* Loop while there are tasks... */ + while ( 1 ) { + + /* Remove any inactive queues. */ + for ( k = 0 ; k < naq ; k++ ) + if ( queues[k]->next == queues[k]->count ) { + naq -= 1; + queues[k] = queues[naq]; + k -= 1; + } + if ( naq == 0 ) + break; + + /* Get a task, how and from where depends on the policy. */ + TIMER_TIC + t = NULL; + if ( r->nr_queues == 1 ) { + t = queue_gettask( &r->queues[0] , 1 , 0 ); + } + else if ( r->policy & runner_policy_steal ) { + if ( ( myq->next == myq->count ) || + ( t = queue_gettask( myq , 0 , 0 ) ) == NULL ) { + TIMER_TIC2 + qid = rand() % naq; + keep = ( r->policy & runner_policy_keep ) && + ( myq->count <= myq->size-tpq ); + if ( myq->next == myq->count ) + COUNT(runner_counter_steal_empty); + else + COUNT(runner_counter_steal_stall); + t = queue_gettask( queues[qid] , 0 , keep ); + if ( t != NULL && keep ) { + COUNT(runner_counter_keep); + if ( lock_lock( &myq->lock ) != 0 ) + error( "Failed to get queue lock." ); + for ( k = myq->count ; k > myq->next ; k-- ) + myq->tid[k] = myq->tid[k-1]; + myq->tid[ myq->next ] = t - s->tasks; + myq->count += 1; + myq->next += 1; + if ( lock_unlock( &myq->lock ) != 0 ) + error( "Failed to unlock queue." ); + } + TIMER_TOC2(runner_timer_steal); + } + } + else if ( r->policy & runner_policy_rand ) { + qid = rand() % naq; + t = queue_gettask( queues[qid] , r->policy & runner_policy_block , 0 ); + } + else { + t = queue_gettask( &r->queues[threadID] , r->policy & runner_policy_block , 0 ); + } + TIMER_TOC(runner_timer_getpair); + + /* Did I get anything? */ + if ( t == NULL ) { + COUNT(runner_counter_stall); + if ( !stalled ) + stalled = getticks(); + continue; + } + #ifdef TIMER + else if ( stalled ) { + stalled = getticks() - stalled; + __sync_add_and_fetch( &runner_timer[runner_timer_stalled] , stalled ); + #ifdef TIMER_VERBOSE + printf( "runner_main[%02i]: stalled %.3f ms\n" , rt->id , ((double)stalled) / CPU_TPS * 1000 ); + fflush(stdout); + #endif + stalled = 0; + } + #endif + + /* Get the cells. */ + ci = t->ci; + cj = t->cj; + + /* Different types of tasks... */ + switch ( t->type ) { + case tid_self: + runner_doself( rt , ci ); + cell_unlocktree( ci ); + break; + case tid_pair: + runner_dopair( rt , ci , cj ); + cell_unlocktree( ci ); + cell_unlocktree( cj ); + break; + case tid_sort: + runner_dosort( rt , ci , t->flags ); + break; + case tid_sub: + runner_dosub( rt , ci , cj , t->flags ); + cell_unlocktree( ci ); + if ( cj != NULL ) + cell_unlocktree( cj ); + break; + default: + error( "Unknown task type." ); + } + + t->done = 1; + + /* Resolve any dependencies. */ + for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) + __sync_fetch_and_sub( &t->unlock_tasks[k]->wait , 1 ); + for ( k = 0 ; k < t->nr_unlock_cells ; k++ ) + __sync_fetch_and_sub( &t->unlock_cells[k]->wait , 1 ); + + } /* main loop. */ + + /* Any leftover stalls? */ + #ifdef TIMER + if ( stalled ) { + stalled = getticks() - stalled; + __sync_add_and_fetch( &runner_timer[runner_timer_stalled] , stalled ); + #ifdef TIMER_VERBOSE + printf( "runner_main[%02i]: stalled %.3f ms\n" , rt->id , ((double)stalled) / CPU_TPS * 1000 ); + fflush(stdout); + #endif + stalled = 0; + } + #endif + + } + + /* Be kind, rewind. */ + return NULL; + + } + + +/** + * @brief Let the #runner loose on the given #space. + * + * @param r The #runner. + * @param s The #space. + */ + +void runner_run ( struct runner *r , int sort_queues ) { + + int j, k; + struct space *s = r->s; + + /* Run throught the tasks and get all the waits right. */ + for ( k = 0 ; k < s->nr_tasks ; k++ ) { + s->tasks[k].done = 0; + for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ ) + s->tasks[k].unlock_tasks[j]->wait += 1; + for ( j = 0 ; j < s->tasks[k].nr_unlock_cells ; j++ ) + s->tasks[k].unlock_cells[j]->wait += 1; + } + + /* Re-set the queues.*/ + if ( sort_queues ) { + #pragma omp parallel for default(none), shared(r) + for ( k = 0 ; k < r->nr_queues ; k++ ) { + queue_sort( &r->queues[k] ); + r->queues[k].next = 0; + } + } + else + for ( k = 0 ; k < r->nr_queues ; k++ ) + r->queues[k].next = 0; + + /* Cry havoc and let loose the dogs of war. */ + r->barrier_count = -r->barrier_count; + if ( pthread_cond_broadcast( &r->barrier_cond ) != 0 ) + error( "Failed to broadcast barrier open condition." ); + + /* Sit back and wait for the runner_threads to come home. */ + while ( r->barrier_count < r->nr_threads ) + if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 ) + error( "Error while waiting for barrier." ); + + } + + +/** + * @brief Get a task free of dependencies and conflicts. + * + * @param q The task #queue. + * @param blocking Block until access to the queue is granted. + * @param keep Remove the returned task from this queue. + */ + +struct task *queue_gettask ( struct queue *q , int blocking , int keep ) { + + int k, tid = -1, qcount, *qtid = q->tid; + lock_type *qlock = &q->lock; + struct task *qtasks = q->tasks, *res = NULL; + TIMER_TIC + + /* If there are no tasks, leave immediately. */ + if ( q->next >= q->count ) { + TIMER_TOC(runner_timer_queue); + return NULL; + } + + /* Main loop, while there are tasks... */ + while ( q->next < q->count ) { + + /* Grab the task lock. */ + // if ( blocking ) { + if ( lock_lock( qlock ) != 0 ) + error( "Locking the task_lock failed.\n" ); + // } + // else { + // if ( lock_trylock( qlock ) != 0 ) + // break; + // } + + /* Loop over the remaining task IDs. */ + qcount = q->count; + for ( k = q->next ; k < qcount ; k++ ) { + + /* Put a finger on the task. */ + res = &qtasks[ qtid[k] ]; + + /* Is this task blocked? */ + if ( res->wait ) + continue; + + /* Different criteria for different types. */ + if ( res->type == tid_self || (res->type == tid_sub && res->cj == NULL) ) { + if ( res->ci->hold || cell_locktree( res->ci ) != 0 ) + continue; + } + else if ( res->type == tid_pair || (res->type == tid_sub && res->cj != NULL) ) { + if ( res->ci->hold || res->cj->hold || res->ci->wait || res->cj->wait ) + continue; + if ( cell_locktree( res->ci ) != 0 ) + continue; + if ( cell_locktree( res->cj ) != 0 ) { + cell_unlocktree( res->ci ); + continue; + } + } + + /* If we made it this far, we're safe. */ + break; + + } /* loop over the task IDs. */ + + /* Did we get a task? */ + if ( k < qcount ) { + + /* Do we need to swap? */ + if ( k != q->next ) + COUNT(runner_counter_swap); + + /* get the task ID. */ + tid = qtid[k]; + + /* Remove the task? */ + if ( keep ) { + + /* Bubble-up. */ + q->count = qcount - 1; + for ( ; k < qcount ; k++) + qtid[k] = qtid[k+1]; + + } + + /* No, leave it in the queue. */ + else { + + TIMER_TIC2 + + /* Bubble-down the task. */ + while ( k > q->next ) { + qtid[ k ] = qtid[ k-1 ]; + k -= 1; + } + qtid[ q->next ] = tid; + + /* up the counter. */ + q->next += 1; + + TIMER_TOC2(runner_timer_bubble); + + } + + } + + /* Release the task lock. */ + if ( lock_unlock( qlock ) != 0 ) + error( "Unlocking the task_lock failed.\n" ); + + /* Leave? */ + if ( tid >= 0 ) { + TIMER_TOC(runner_timer_queue); + return &qtasks[tid]; + } + else if ( !blocking ) + break; + + } /* while there are tasks. */ + + /* No beef. */ + TIMER_TOC(runner_timer_queue); + return NULL; + + } + + +/** + * @brief init a runner with the given number of threads, queues, and + * the given policy. + * + * @param r The #runner. + * @param s The #space in which this #runner will run. + * @param nr_threads The number of threads to spawn. + * @param nr_queues The number of task queues to create. + * @param policy The queueing policy to use. + */ + +void runner_init ( struct runner *r , struct space *s , int nr_threads , int nr_queues , int policy ) { + + int k, qid, nrq; + + /* Store the values. */ + r->s = s; + r->nr_threads = nr_threads; + r->nr_queues = nr_queues; + r->policy = policy; + + /* First of all, init the barrier and lock it. */ + if ( pthread_mutex_init( &r->barrier_mutex , NULL ) != 0 ) + error( "Failed to initialize barrier mutex." ); + if ( pthread_cond_init( &r->barrier_cond , NULL ) != 0 ) + error( "Failed to initialize barrier condition variable." ); + if ( pthread_mutex_lock( &r->barrier_mutex ) != 0 ) + error( "Failed to lock barrier mutex." ); + r->barrier_count = 0; + + /* Allocate the queues. */ + if ( posix_memalign( (void *)(&r->queues) , 64 , nr_queues * sizeof(struct queue) ) != 0 ) + error( "Failed to allocate queues." ); + bzero( r->queues , nr_queues * sizeof(struct queue) ); + + /* Init the queues. */ + for ( k = 0 ; k < nr_queues ; k++ ) { + r->queues[k].size = s->nr_tasks; + if ( ( r->queues[k].tid = (int *)malloc( sizeof(int) * r->queues[k].size ) ) == NULL ) + error( "Failed to allocate queue tids." ); + r->queues[k].count = 0; + r->queues[k].next = 0; + if ( lock_init( &r->queues[k].lock ) != 0 ) + error( "Failed to init queue lock." ); + r->queues[k].tasks = s->tasks; + r->queues[k].r = r; + } + + /* Rank the tasks in topological order. */ + runner_ranktasks( r ); + + /* How many queues to fill initially? */ + for ( nrq = 0 , k = nr_queues ; k > 0 ; k = k / 2 ) + nrq += 1; + + /* Fill the queues (round-robin). */ + for ( k = 0 ; k < s->nr_tasks ; k++ ) { + if ( s->tasks[ s->tasks_ind[k] ].type == tid_none) + continue; + // qid = 0; + // qid = k % nrq; + qid = k % nr_queues; + r->queues[qid].tid[ r->queues[qid].count ] = s->tasks_ind[k]; + r->queues[qid].count += 1; + } + + /* Sort the queues topologically. */ + for ( k = 0 ; k < nr_queues ; k++ ) + queue_sort( &r->queues[k] ); + + /* Allocate and init the threads. */ + if ( ( r->threads = (struct runner_thread *)malloc( sizeof(struct runner_thread) * nr_threads ) ) == NULL ) + error( "Failed to allocate threads array." ); + for ( k = 0 ; k < nr_threads ; k++ ) { + r->threads[k].id = k; + r->threads[k].r = r; + if ( pthread_create( &r->threads[k].thread , NULL , &runner_main , &r->threads[k] ) != 0 ) + error( "Failed to create runner thread." ); + } + + /* Wait for the runner threads to be in place. */ + while ( r->barrier_count != r->nr_threads ) + if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 ) + error( "Error while waiting for runner threads to get in place." ); + + } + + + diff --git a/runner.h b/runner.h new file mode 100644 index 0000000000000000000000000000000000000000..5f233ada759ee8bcb472a26aee7db6eb18fadb65 --- /dev/null +++ b/runner.h @@ -0,0 +1,212 @@ + +/* Some constants. */ +#define runner_policy_none 0 +#define runner_policy_rand 1 +#define runner_policy_steal 2 +#define runner_policy_keep 4 +#define runner_policy_block 8 + +#define runner_queue_scale 1.2 + + +/* The timers themselves. */ +enum { + runner_timer_none = 0, + runner_timer_dosort, + runner_timer_doself, + runner_timer_dopair, + runner_timer_dosub, + runner_timer_getpair, + runner_timer_queue, + runner_timer_tree, + runner_timer_bubble, + runner_timer_steal, + runner_timer_stalled, + runner_timer_count, + }; +extern ticks runner_timer[ runner_timer_count ]; + + +/* Define the timer macros. */ +#ifdef TIMER_VERBOSE + #define TIMER +#endif +#ifdef TIMER + #define TIMER_TIC ticks tic = getticks(); + #define TIMER_TOC(t) timer_toc( t , tic ) + #define TIMER_TIC2 ticks tic2 = getticks(); + #define TIMER_TOC2(t) timer_toc( t , tic2 ) + #ifndef INLINE + # if __GNUC__ && !__GNUC_STDC_INLINE__ + # define INLINE extern inline + # else + # define INLINE inline + # endif + #endif + INLINE ticks timer_toc ( int t , ticks tic ) { + ticks d = (getticks() - tic); + __sync_add_and_fetch( &runner_timer[t] , d ); + return d; + } +#else + #define TIMER_TIC + #define TIMER_TOC(t) +#endif + + +/* Counters. */ +enum { + runner_counter_swap = 0, + runner_counter_stall, + runner_counter_steal_stall, + runner_counter_steal_empty, + runner_counter_keep, + runner_counter_count, + }; +extern int runner_counter[ runner_counter_count ]; + + +/* Counter macros. */ +#ifdef COUNTER + #define COUNT(c) ( __sync_add_and_fetch( &runner_counter[ c ] , 1 ) ) +#else + #define COUNT(c) +#endif + +/* Get the inlining right. */ +#ifndef INLINE +# if __GNUC__ && !__GNUC_STDC_INLINE__ +# define INLINE extern inline +# else +# define INLINE inline +# endif +#endif + + +/** + * @brief Compute the 'interaction' between two particles. + * + * @param r2 the inter-particle radius squared. + * @param hi the screening distance of the ith particle. + * @param hj the screening distance of the jth particle. + * @param io Pointer to where to store the interaction of the ith particle. + * @param jo Pointer to where to store the interaction of the ith particle. + */ + +__attribute__ ((always_inline)) INLINE void iact ( float r2 , float hi , float hj , float *io , float *jo , int *ic , int *jc ) { + + #define KERNEL_COEFF_1 2.546479089470f + #define KERNEL_COEFF_2 15.278874536822f + #define KERNEL_COEFF_3 45.836623610466f + #define KERNEL_COEFF_4 30.557749073644f + #define KERNEL_COEFF_5 5.092958178941f + #define KERNEL_COEFF_6 (-15.278874536822f) + #define NORM_COEFF 4.188790204786f + + float r = sqrtf( r2 ); + float ui, uj, wi, wj; + + if ( r2 < hi*hi ) { + + ui = r / hi; + if ( ui < 0.5 ) + wi = KERNEL_COEFF_1 + KERNEL_COEFF_2 * (ui - 1.0f) * ui * ui; + else + wi = KERNEL_COEFF_5 * (1.0f - ui) * (1.0f - ui) * (1.0 - ui); + if ( io != NULL ) + *io += NORM_COEFF * wi; + if ( ic != NULL ) + *ic += 1; + + } + + if ( r2 < hj*hj ) { + + uj = r / hj; + if ( uj < 0.5 ) + wj = KERNEL_COEFF_1 + KERNEL_COEFF_2 * (uj - 1.0f) * uj * uj; + else + wj = KERNEL_COEFF_5 * (1.0f - uj) * (1.0f - uj) * (1.0 - uj); + if ( jo != NULL ) + *jo += NORM_COEFF * wj; + if ( jc != NULL ) + *jc += 1; + + } + + } + + + +/* A task queue. */ +struct queue { + + /* The lock to access this queue. */ + lock_type lock; + + /* Size, count and next element. */ + int size, count, next; + + /* The runner in which this queue lives. */ + struct runner *r; + + /* The actual tasks to which the indices refer. */ + struct task *tasks; + + /* The task indices. */ + int *tid; + + } __attribute__((aligned (64))); + + +/* A struct representing a runner's thread and its data. */ +struct runner_thread { + + /* The id of this thread. */ + int id; + + /* The thread which it is running. */ + pthread_t thread; + + /* The underlying runner. */ + struct runner *r; + + }; + + +/* Data structure for the runner. */ +struct runner { + + /* Number of threads on which to run. */ + int nr_threads; + + /* The space with which the runner is associated. */ + struct space *s; + + /* The runner's threads. */ + struct runner_thread *threads; + + /* The running policy. */ + int policy; + + /* The number of queues. */ + int nr_queues; + + /* The queues. */ + struct queue *queues; + + /* Data for the threads' barrier. */ + pthread_mutex_t barrier_mutex; + pthread_cond_t barrier_cond; + int barrier_count; + + }; + + +/* Function prototypes. */ +void runner_run ( struct runner *r , int sort_queues ); +void runner_dopair ( struct runner_thread *rt , struct cell *ci , struct cell *cj ); +void runner_doself ( struct runner_thread *rt , struct cell *c ); +void runner_dosort ( struct runner_thread *rt , struct cell *c , int flag ); +void runner_init ( struct runner *r , struct space *s , int nr_threads , int nr_queues , int policy ); +struct task *queue_gettask ( struct queue *q , int blocking , int keep ); diff --git a/snap_C09/extract.m b/snap_C09/extract.m new file mode 100644 index 0000000000000000000000000000000000000000..c167dfef1eb072d9b3911c15260e8f24043fd7dd --- /dev/null +++ b/snap_C09/extract.m @@ -0,0 +1,35 @@ + +% Remove any old data +!rm -f *.txt + +% Loop over the input files +range = [ Inf , -Inf ]; +avg = [ 0 , 0 , 0 ]; +count = 0; +shift = [ 4.331456e+01 , 4.097030e+01 , 4.383432e+01 ]; +for i=0:15 + + % Get the file name + fname = sprintf( 'snap_C09_200_000.%i.hdf5' , i ); + + % Get the coordinates + coord = double( h5read( fname , '/PartType0/Coordinates' )' ); + coord = coord - repmat( shift , size( coord , 1 ) , 1 ); + save Coordinates.txt -ascii -double -append coord + + % Adjust the range + range(1) = min( range(1) , min(min(coord)) ); + range(2) = max( range(2) , max(max(coord)) ); + avg = avg + sum( coord , 1 ); + count = count + size( coord , 1 ); + + % Get the smoothing lengths + h = double( h5read( fname , '/PartType0/SmoothingLength' ) ); + save SmoothingLength.txt -ascii -double -append h + +end + +% Display some statistics +disp( sprintf( 'read %i particles' , count ) ); +disp( sprintf( 'range of coords is [ %e , %e ]' , range(1) , range(2) ) ); +disp( sprintf( 'avg position is [ %e , %e , %e ]' , avg(1)/count , avg(2)/count , avg(3)/count ) ); diff --git a/space.c b/space.c new file mode 100644 index 0000000000000000000000000000000000000000..afed2e76a79d170e3d6cb1a029260017c8535bc1 --- /dev/null +++ b/space.c @@ -0,0 +1,1313 @@ + + +/* Some standard headers. */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> +#include <float.h> +#include <limits.h> +#include <math.h> + +/* Local headers. */ +#include "cycle.h" +#include "lock.h" +#include "space.h" +#include "runner.h" + +/* Error macro. */ +#define error(s) { printf( "%s:%s:%i: %s\n" , __FILE__ , __FUNCTION__ , __LINE__ , s ); abort(); } + +/* Convert cell location to ID. */ +#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) ) + +/* Task type names. */ +const char *taskID_names[tid_count] = { "none" , "sort" , "self" , "pair" , "sub" }; + +/* Map shift vector to sortlist. */ +const int sortlistID[27] = { + /* ( -1 , -1 , -1 ) */ 0 , + /* ( -1 , -1 , 0 ) */ 1 , + /* ( -1 , -1 , 1 ) */ 2 , + /* ( -1 , 0 , -1 ) */ 3 , + /* ( -1 , 0 , 0 ) */ 4 , + /* ( -1 , 0 , 1 ) */ 5 , + /* ( -1 , 1 , -1 ) */ 6 , + /* ( -1 , 1 , 0 ) */ 7 , + /* ( -1 , 1 , 1 ) */ 8 , + /* ( 0 , -1 , -1 ) */ 9 , + /* ( 0 , -1 , 0 ) */ 10 , + /* ( 0 , -1 , 1 ) */ 11 , + /* ( 0 , 0 , -1 ) */ 12 , + /* ( 0 , 0 , 0 ) */ 0 , + /* ( 0 , 0 , 1 ) */ 12 , + /* ( 0 , 1 , -1 ) */ 11 , + /* ( 0 , 1 , 0 ) */ 10 , + /* ( 0 , 1 , 1 ) */ 9 , + /* ( 1 , -1 , -1 ) */ 8 , + /* ( 1 , -1 , 0 ) */ 7 , + /* ( 1 , -1 , 1 ) */ 6 , + /* ( 1 , 0 , -1 ) */ 5 , + /* ( 1 , 0 , 0 ) */ 4 , + /* ( 1 , 0 , 1 ) */ 3 , + /* ( 1 , 1 , -1 ) */ 2 , + /* ( 1 , 1 , 0 ) */ 1 , + /* ( 1 , 1 , 1 ) */ 0 + }; + + +/** + * @brief Mapping function to draw a specific cell (gnuplot). + */ + +void space_map_clearsort ( struct cell *c , void *data ) { + + if ( c->sort != NULL ) { + free( c->sort ); + c->sort = NULL; + } + + } + + +/** + * @brief Get a task free of dependencies and conflicts. + * + * @param s The #space. + */ + +struct task *space_gettask ( struct space *s ) { + + int k, tid = -1; + struct task *res = NULL; + struct cell *c; + + /* Main loop, while there are tasks... */ + while ( s->next_task < s->nr_tasks ) { + + /* Grab the task lock. */ + if ( lock_lock( &s->task_lock ) != 0 ) + error( "Locking the task_lock failed.\n" ); + + /* Loop over the remaining task IDs. */ + for ( k = s->next_task ; k < s->nr_tasks ; k++ ) { + + /* Put a finger on the task. */ + res = &s->tasks[ s->tasks_ind[k] ]; + + /* Is this task blocked? */ + if ( res->wait ) + continue; + + /* Different criteria for different types. */ + switch ( res->type ) { + case tid_self: + if ( res->ci->lock || res->ci->wait > 0 ) + continue; + break; + case tid_pair: + if ( res->ci->lock || res->cj->lock || res->ci->wait || res->cj->wait ) + continue; + break; + case tid_sort: + if ( res->ci->lock ) + continue; + break; + } + + /* If we made it this far, we're safe. */ + break; + + } /* loop over the task IDs. */ + + /* Did we get a task? */ + if ( k < s->nr_tasks ) { + + // /* Swap to front. */ + // tid = s->tasks_ind[k]; + // s->tasks_ind[k] = s->tasks_ind[ s->next_task ]; + // s->tasks_ind[ s->next_task ] = tid; + + /* Bubble-down the task. */ + tid = s->tasks_ind[k]; + while ( k > s->next_task ) { + s->tasks_ind[ k ] = s->tasks_ind[ k-1 ]; + k -= 1; + } + s->tasks_ind[ s->next_task ] = tid; + + /* Lock the cells, if needed. */ + if ( s->tasks[tid].type != tid_sort ) { + for ( c = res->ci ; c != NULL ; c = c->parent ) + __sync_fetch_and_add( &c->lock , 1 ); + for ( c = res->cj ; c != NULL ; c = c->parent ) + __sync_fetch_and_add( &c->lock , 1 ); + } + + /* up the counter. */ + s->next_task += 1; + + } + + /* Release the task lock. */ + if ( lock_unlock( &s->task_lock ) != 0 ) + error( "Locking the task_lock failed.\n" ); + + /* Leave? */ + if ( tid >= 0 ) + return &s->tasks[tid]; + + } /* while there are tasks. */ + + /* No beef. */ + return NULL; + + } + + +/** + * @brief Map a function to all particles in a aspace. + * + * @param s The #space we are working in. + */ + +void space_map_parts ( struct space *s , void (*fun)( struct part *p , struct cell *c , void *data ) , void *data ) { + + int i; + + void rec_map ( struct cell *c ) { + + int k; + + /* No progeny? */ + if ( !c->split ) + for ( k = 0 ; k < c->count ; k++ ) + fun( &c->parts[k] , c , data ); + + /* Otherwise, recurse. */ + else + for ( k = 0 ; k < 8 ; k++ ) + if ( c->progeny[k] != NULL ) + rec_map( c->progeny[k] ); + + } + + /* Call the recursive function on all higher-level cells. */ + for ( i = 0 ; i < s->nr_cells ; i++ ) + rec_map( &s->cells[i] ); + + } + + +/** + * @brief Map a function to all particles in a aspace. + * + * @param s The #space we are working in. + */ + +void space_map_cells ( struct space *s , void (*fun)( struct cell *c , void *data ) , void *data ) { + + int i; + + void rec_map ( struct cell *c ) { + + int k; + + /* No progeny? */ + if ( !c->split ) + fun( c , data ); + + /* Otherwise, recurse. */ + else + for ( k = 0 ; k < 8 ; k++ ) + if ( c->progeny[k] != NULL ) + rec_map( c->progeny[k] ); + + } + + /* Call the recursive function on all higher-level cells. */ + for ( i = 0 ; i < s->nr_cells ; i++ ) + rec_map( &s->cells[i] ); + + } + + +/** + * @brief Add a #task to the #space. + * + * @param s The #space we are working in. + */ + +struct task *space_addtask ( struct space *s , int type , int flags , int wait , struct cell *ci , struct cell *cj , struct task *unlock_tasks[] , int nr_unlock_tasks , struct cell *unlock_cells[] , int nr_unlock_cells ) { + + struct task *t = &s->tasks[ s->nr_tasks ]; + + /* Copy the data. */ + t->type = type; + t->flags = flags; + t->wait = wait; + t->ci = ci; + t->cj = cj; + if ( unlock_tasks != NULL ) + memcpy( t->unlock_tasks , unlock_tasks , sizeof(struct task *) * nr_unlock_tasks ); + t->nr_unlock_tasks = nr_unlock_tasks; + if ( unlock_cells != NULL ) + memcpy( t->unlock_cells , unlock_cells , sizeof(struct task *) * nr_unlock_cells ); + t->nr_unlock_cells = nr_unlock_cells; + + /* Increase the task counter. */ + s->nr_tasks += 1; + + /* Return a pointer to the new task. */ + return t; + + } + + + +/** + * @brief Remove an unlock_task from the given task. + * + * @param ta The unlocking #task. + * @param tb The #task that will be unlocked. + */ + +void task_rmunlock( struct task *ta , struct task *tb ) { + + int k; + + for ( k = 0 ; k < ta->nr_unlock_tasks ; k++ ) + if ( ta->unlock_tasks[k] == tb ) { + ta->nr_unlock_tasks -= 1; + ta->unlock_tasks[k] = ta->unlock_tasks[ ta->nr_unlock_tasks ]; + return; + } + error( "Task not found." ); + + } + + +/** + * @brief Add an unlock_task to the given task. + * + * @param ta The unlocking #task. + * @param tb The #task that will be unlocked. + */ + +void task_addunlock( struct task *ta , struct task *tb ) { + + int k; + + /* Bogus? */ + if ( ta == NULL || tb == NULL ) + return; + + /* Check if ta already unlocks tb. */ + for ( k = 0 ; k < ta->nr_unlock_tasks ; k++ ) + if ( ta->unlock_tasks[k] == tb ) + return; + + if ( ta->nr_unlock_tasks == task_maxunlock ) + error( "Too many unlock_tasks in task." ); + + ta->unlock_tasks[ ta->nr_unlock_tasks] = tb; + ta->nr_unlock_tasks += 1; + + } + + +/** + * @brief Split tasks that may be too large. + * + * @param s The #space we are working in. + */ + +void space_splittasks ( struct space *s ) { + + int k, sid, tid; + struct cell *ci, *cj; + double hi, hj, shift[3]; + struct task *t; + + /* Loop through the tasks... */ + for ( tid = 0 ; tid < s->nr_tasks ; tid++ ) { + + /* Get a pointer on the task. */ + t = &s->tasks[tid]; + + /* If this task isn't a pair, i'm not interested. */ + if ( t->type != tid_pair ) + continue; + + /* Get a handle on the cells involved. */ + ci = t->ci; + cj = t->cj; + hi = fmax( ci->h[0] , fmax( ci->h[1] , ci->h[2] ) ); + hj = fmax( cj->h[0] , fmax( cj->h[1] , cj->h[2] ) ); + + /* Should this task be split-up? */ + if ( ci->split && cj->split && ci->r_max < hi/2 && cj->r_max < hj/2 ) { + + /* Get the relative distance between the pairs, wrapping. */ + for ( k = 0 ; k < 3 ; k++ ) { + if ( cj->loc[k] - ci->loc[k] < -s->dim[k]/2 ) + shift[k] = s->dim[k]; + else if ( cj->loc[k] - ci->loc[k] > s->dim[k]/2 ) + shift[k] = -s->dim[k]; + else + shift[k] = 0.0; + } + + /* Get the sorting index. */ + for ( sid = 0 , k = 0 ; k < 3 ; k++ ) + sid = 3*sid + ( (cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1 ); + + /* Flip? */ + if ( sid < 13 ) { + cj = t->ci; + ci = t->cj; + t->ci = ci; t->cj = cj; + } + else + sid = 26 - sid; + + /* Remove the dependency of this task on the sorts of ci and cj. */ + task_rmunlock( ci->sorts[sid] , t ); + task_rmunlock( cj->sorts[sid] , t ); + ci->nr_pairs -= 1; + cj->nr_pairs -= 1; + t->nr_unlock_cells = 0; + + /* For each different sorting type... */ + switch ( sid ) { + + case 0: /* ( 1 , 1 , 1 ) */ + t->ci = ci->progeny[7]; t->cj = cj->progeny[0]; + task_addunlock( ci->progeny[7]->sorts[0] , t ); task_addunlock( cj->progeny[0]->sorts[0] , t ); + ci->progeny[7]->nr_pairs += 1; + cj->progeny[0]->nr_pairs += 1; + break; + + case 1: /* ( 1 , 1 , 0 ) */ + if ( !ci->progeny[6]->split && !ci->progeny[7]->split && + !cj->progeny[0]->split && !cj->progeny[1]->split ) { + t->type = tid_sub; t->flags = 1; + task_addunlock( ci->progeny[6]->sorts[1] , t ); task_addunlock( cj->progeny[0]->sorts[1] , t ); + task_addunlock( ci->progeny[7]->sorts[1] , t ); task_addunlock( cj->progeny[1]->sorts[1] , t ); + task_addunlock( ci->progeny[6]->sorts[0] , t ); task_addunlock( cj->progeny[1]->sorts[0] , t ); + task_addunlock( ci->progeny[7]->sorts[2] , t ); task_addunlock( cj->progeny[0]->sorts[2] , t ); + } + else { + t->ci = ci->progeny[6]; t->cj = cj->progeny[0]; + task_addunlock( ci->progeny[6]->sorts[1] , t ); task_addunlock( cj->progeny[0]->sorts[1] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[1] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[1] , t ); task_addunlock( cj->progeny[1]->sorts[1] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[1] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[6]->sorts[0] , t ); task_addunlock( cj->progeny[1]->sorts[0] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[0] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[2] , t ); task_addunlock( cj->progeny[0]->sorts[2] , t ); + } + ci->progeny[6]->nr_pairs += 2; + ci->progeny[7]->nr_pairs += 2; + cj->progeny[0]->nr_pairs += 2; + cj->progeny[1]->nr_pairs += 2; + break; + + case 2: /* ( 1 , 1 , -1 ) */ + t->ci = ci->progeny[6]; t->cj = cj->progeny[1]; + task_addunlock( ci->progeny[6]->sorts[2] , t ); task_addunlock( cj->progeny[1]->sorts[2] , t ); + ci->progeny[6]->nr_pairs += 1; + cj->progeny[1]->nr_pairs += 1; + break; + + case 3: /* ( 1 , 0 , 1 ) */ + if ( !ci->progeny[5]->split && !ci->progeny[7]->split && + !cj->progeny[0]->split && !cj->progeny[2]->split ) { + t->type = tid_sub; t->flags = 3; + task_addunlock( ci->progeny[5]->sorts[3] , t ); task_addunlock( cj->progeny[0]->sorts[3] , t ); + task_addunlock( ci->progeny[7]->sorts[3] , t ); task_addunlock( cj->progeny[2]->sorts[3] , t ); + task_addunlock( ci->progeny[5]->sorts[0] , t ); task_addunlock( cj->progeny[2]->sorts[0] , t ); + task_addunlock( ci->progeny[7]->sorts[6] , t ); task_addunlock( cj->progeny[0]->sorts[6] , t ); + } + else { + t->ci = ci->progeny[5]; t->cj = cj->progeny[0]; + task_addunlock( ci->progeny[5]->sorts[3] , t ); task_addunlock( cj->progeny[0]->sorts[3] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[2] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[3] , t ); task_addunlock( cj->progeny[2]->sorts[3] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[2] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[5]->sorts[0] , t ); task_addunlock( cj->progeny[2]->sorts[0] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[0] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[6] , t ); task_addunlock( cj->progeny[0]->sorts[6] , t ); + } + ci->progeny[5]->nr_pairs += 2; + ci->progeny[7]->nr_pairs += 2; + cj->progeny[0]->nr_pairs += 2; + cj->progeny[2]->nr_pairs += 2; + break; + + case 4: /* ( 1 , 0 , 0 ) */ + if ( !ci->progeny[4]->split && !ci->progeny[5]->split && !ci->progeny[6]->split && !ci->progeny[7]->split && + !cj->progeny[0]->split && !cj->progeny[1]->split && !cj->progeny[2]->split && !cj->progeny[3]->split) { + t->type = tid_sub; t->flags = 4; + task_addunlock( ci->progeny[4]->sorts[4] , t ); task_addunlock( cj->progeny[0]->sorts[4] , t ); + task_addunlock( ci->progeny[5]->sorts[5] , t ); task_addunlock( cj->progeny[0]->sorts[5] , t ); + task_addunlock( ci->progeny[6]->sorts[7] , t ); task_addunlock( cj->progeny[0]->sorts[7] , t ); + task_addunlock( ci->progeny[7]->sorts[8] , t ); task_addunlock( cj->progeny[0]->sorts[8] , t ); + task_addunlock( ci->progeny[4]->sorts[3] , t ); task_addunlock( cj->progeny[1]->sorts[3] , t ); + task_addunlock( ci->progeny[5]->sorts[4] , t ); task_addunlock( cj->progeny[1]->sorts[4] , t ); + task_addunlock( ci->progeny[6]->sorts[6] , t ); task_addunlock( cj->progeny[1]->sorts[6] , t ); + task_addunlock( ci->progeny[7]->sorts[7] , t ); task_addunlock( cj->progeny[1]->sorts[7] , t ); + task_addunlock( ci->progeny[4]->sorts[1] , t ); task_addunlock( cj->progeny[2]->sorts[1] , t ); + task_addunlock( ci->progeny[5]->sorts[2] , t ); task_addunlock( cj->progeny[2]->sorts[2] , t ); + task_addunlock( ci->progeny[6]->sorts[4] , t ); task_addunlock( cj->progeny[2]->sorts[4] , t ); + task_addunlock( ci->progeny[7]->sorts[5] , t ); task_addunlock( cj->progeny[2]->sorts[5] , t ); + task_addunlock( ci->progeny[4]->sorts[0] , t ); task_addunlock( cj->progeny[3]->sorts[0] , t ); + task_addunlock( ci->progeny[5]->sorts[1] , t ); task_addunlock( cj->progeny[3]->sorts[1] , t ); + task_addunlock( ci->progeny[6]->sorts[3] , t ); task_addunlock( cj->progeny[3]->sorts[3] , t ); + task_addunlock( ci->progeny[7]->sorts[4] , t ); task_addunlock( cj->progeny[3]->sorts[4] , t ); + } + else { + t->ci = ci->progeny[4]; t->cj = cj->progeny[0]; + task_addunlock( ci->progeny[4]->sorts[4] , t ); task_addunlock( cj->progeny[0]->sorts[4] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[0] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[5]->sorts[5] , t ); task_addunlock( cj->progeny[0]->sorts[5] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[0] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[6]->sorts[7] , t ); task_addunlock( cj->progeny[0]->sorts[7] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[0] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[8] , t ); task_addunlock( cj->progeny[0]->sorts[8] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[4] , cj->progeny[1] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[4]->sorts[3] , t ); task_addunlock( cj->progeny[1]->sorts[3] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[1] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[5]->sorts[4] , t ); task_addunlock( cj->progeny[1]->sorts[4] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[1] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[6]->sorts[6] , t ); task_addunlock( cj->progeny[1]->sorts[6] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[1] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[7] , t ); task_addunlock( cj->progeny[1]->sorts[7] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[4] , cj->progeny[2] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[4]->sorts[1] , t ); task_addunlock( cj->progeny[2]->sorts[1] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[2] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[5]->sorts[2] , t ); task_addunlock( cj->progeny[2]->sorts[2] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[2] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[6]->sorts[4] , t ); task_addunlock( cj->progeny[2]->sorts[4] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[2] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[5] , t ); task_addunlock( cj->progeny[2]->sorts[5] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[4] , cj->progeny[3] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[4]->sorts[0] , t ); task_addunlock( cj->progeny[3]->sorts[0] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[3] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[5]->sorts[1] , t ); task_addunlock( cj->progeny[3]->sorts[1] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[3] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[6]->sorts[3] , t ); task_addunlock( cj->progeny[3]->sorts[3] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[3] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[4] , t ); task_addunlock( cj->progeny[3]->sorts[4] , t ); + } + ci->progeny[4]->nr_pairs += 4; + ci->progeny[5]->nr_pairs += 4; + ci->progeny[6]->nr_pairs += 4; + ci->progeny[7]->nr_pairs += 4; + cj->progeny[0]->nr_pairs += 4; + cj->progeny[1]->nr_pairs += 4; + cj->progeny[2]->nr_pairs += 4; + cj->progeny[3]->nr_pairs += 4; + break; + + case 5: /* ( 1 , 0 , -1 ) */ + if ( !ci->progeny[4]->split && !ci->progeny[6]->split && + !cj->progeny[1]->split && !cj->progeny[3]->split ) { + t->type = tid_sub; t->flags = 5; + task_addunlock( ci->progeny[4]->sorts[5] , t ); task_addunlock( cj->progeny[1]->sorts[5] , t ); + task_addunlock( ci->progeny[6]->sorts[5] , t ); task_addunlock( cj->progeny[3]->sorts[5] , t ); + task_addunlock( ci->progeny[4]->sorts[2] , t ); task_addunlock( cj->progeny[3]->sorts[2] , t ); + task_addunlock( ci->progeny[6]->sorts[8] , t ); task_addunlock( cj->progeny[1]->sorts[8] , t ); + } + else { + t->ci = ci->progeny[4]; t->cj = cj->progeny[1]; + task_addunlock( ci->progeny[4]->sorts[5] , t ); task_addunlock( cj->progeny[1]->sorts[5] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[3] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[6]->sorts[5] , t ); task_addunlock( cj->progeny[3]->sorts[5] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[4] , cj->progeny[3] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[4]->sorts[2] , t ); task_addunlock( cj->progeny[3]->sorts[2] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[1] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[6]->sorts[8] , t ); task_addunlock( cj->progeny[1]->sorts[8] , t ); + } + ci->progeny[4]->nr_pairs += 2; + ci->progeny[6]->nr_pairs += 2; + cj->progeny[1]->nr_pairs += 2; + cj->progeny[3]->nr_pairs += 2; + break; + + case 6: /* ( 1 , -1 , 1 ) */ + t->ci = ci->progeny[5]; t->cj = cj->progeny[2]; + task_addunlock( ci->progeny[5]->sorts[6] , t ); task_addunlock( cj->progeny[2]->sorts[6] , t ); + ci->progeny[5]->nr_pairs += 1; + cj->progeny[2]->nr_pairs += 1; + break; + + case 7: /* ( 1 , -1 , 0 ) */ + if ( !ci->progeny[4]->split && !ci->progeny[5]->split && + !cj->progeny[2]->split && !cj->progeny[3]->split ) { + t->type = tid_sub; t->flags = 7; + task_addunlock( ci->progeny[4]->sorts[6] , t ); task_addunlock( cj->progeny[3]->sorts[6] , t ); + task_addunlock( ci->progeny[5]->sorts[8] , t ); task_addunlock( cj->progeny[2]->sorts[8] , t ); + task_addunlock( ci->progeny[4]->sorts[7] , t ); task_addunlock( cj->progeny[2]->sorts[7] , t ); + task_addunlock( ci->progeny[5]->sorts[7] , t ); task_addunlock( cj->progeny[3]->sorts[7] , t ); + } + else { + t->ci = ci->progeny[4]; t->cj = cj->progeny[3]; + task_addunlock( ci->progeny[4]->sorts[6] , t ); task_addunlock( cj->progeny[3]->sorts[6] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[2] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[5]->sorts[8] , t ); task_addunlock( cj->progeny[2]->sorts[8] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[4] , cj->progeny[2] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[4]->sorts[7] , t ); task_addunlock( cj->progeny[2]->sorts[7] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[3] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[5]->sorts[7] , t ); task_addunlock( cj->progeny[3]->sorts[7] , t ); + } + ci->progeny[4]->nr_pairs += 2; + ci->progeny[5]->nr_pairs += 2; + cj->progeny[2]->nr_pairs += 2; + cj->progeny[3]->nr_pairs += 2; + break; + + case 8: /* ( 1 , -1 , -1 ) */ + t->ci = ci->progeny[4]; t->cj = cj->progeny[3]; + task_addunlock( ci->progeny[4]->sorts[8] , t ); task_addunlock( cj->progeny[3]->sorts[8] , t ); + ci->progeny[4]->nr_pairs += 1; + cj->progeny[3]->nr_pairs += 1; + break; + + case 9: /* ( 0 , 1 , 1 ) */ + if ( !ci->progeny[3]->split && !ci->progeny[7]->split && + !cj->progeny[0]->split && !cj->progeny[4]->split ) { + t->type = tid_sub; t->flags = 9; + task_addunlock( ci->progeny[3]->sorts[9] , t ); task_addunlock( cj->progeny[0]->sorts[9] , t ); + task_addunlock( ci->progeny[7]->sorts[9] , t ); task_addunlock( cj->progeny[4]->sorts[9] , t ); + task_addunlock( ci->progeny[3]->sorts[0] , t ); task_addunlock( cj->progeny[4]->sorts[0] , t ); + task_addunlock( ci->progeny[7]->sorts[8] , t ); task_addunlock( cj->progeny[0]->sorts[8] , t ); + } + else { + t->ci = ci->progeny[3]; t->cj = cj->progeny[0]; + task_addunlock( ci->progeny[3]->sorts[9] , t ); task_addunlock( cj->progeny[0]->sorts[9] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[4] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[9] , t ); task_addunlock( cj->progeny[4]->sorts[9] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[4] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[3]->sorts[0] , t ); task_addunlock( cj->progeny[4]->sorts[0] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[0] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[8] , t ); task_addunlock( cj->progeny[0]->sorts[8] , t ); + } + ci->progeny[3]->nr_pairs += 2; + ci->progeny[7]->nr_pairs += 2; + cj->progeny[0]->nr_pairs += 2; + cj->progeny[4]->nr_pairs += 2; + break; + + case 10: /* ( 0 , 1 , 0 ) */ + if ( !ci->progeny[2]->split && !ci->progeny[3]->split && !ci->progeny[6]->split && !ci->progeny[7]->split && + !cj->progeny[0]->split && !cj->progeny[1]->split && !cj->progeny[4]->split && !cj->progeny[5]->split) { + t->type = tid_sub; t->flags = 10; + task_addunlock( ci->progeny[2]->sorts[10] , t ); task_addunlock( cj->progeny[0]->sorts[10] , t ); + task_addunlock( ci->progeny[3]->sorts[11] , t ); task_addunlock( cj->progeny[0]->sorts[11] , t ); + task_addunlock( ci->progeny[6]->sorts[7] , t ); task_addunlock( cj->progeny[0]->sorts[7] , t ); + task_addunlock( ci->progeny[7]->sorts[6] , t ); task_addunlock( cj->progeny[0]->sorts[6] , t ); + task_addunlock( ci->progeny[2]->sorts[9] , t ); task_addunlock( cj->progeny[1]->sorts[9] , t ); + task_addunlock( ci->progeny[3]->sorts[10] , t ); task_addunlock( cj->progeny[1]->sorts[10] , t ); + task_addunlock( ci->progeny[6]->sorts[8] , t ); task_addunlock( cj->progeny[1]->sorts[8] , t ); + task_addunlock( ci->progeny[7]->sorts[7] , t ); task_addunlock( cj->progeny[1]->sorts[7] , t ); + task_addunlock( ci->progeny[2]->sorts[1] , t ); task_addunlock( cj->progeny[4]->sorts[1] , t ); + task_addunlock( ci->progeny[3]->sorts[2] , t ); task_addunlock( cj->progeny[4]->sorts[2] , t ); + task_addunlock( ci->progeny[6]->sorts[10] , t ); task_addunlock( cj->progeny[4]->sorts[10] , t ); + task_addunlock( ci->progeny[7]->sorts[11] , t ); task_addunlock( cj->progeny[4]->sorts[11] , t ); + task_addunlock( ci->progeny[2]->sorts[0] , t ); task_addunlock( cj->progeny[5]->sorts[0] , t ); + task_addunlock( ci->progeny[3]->sorts[1] , t ); task_addunlock( cj->progeny[5]->sorts[1] , t ); + task_addunlock( ci->progeny[6]->sorts[9] , t ); task_addunlock( cj->progeny[5]->sorts[9] , t ); + task_addunlock( ci->progeny[7]->sorts[10] , t ); task_addunlock( cj->progeny[5]->sorts[10] , t ); + } + else { + t->ci = ci->progeny[2]; t->cj = cj->progeny[0]; + task_addunlock( ci->progeny[2]->sorts[10] , t ); task_addunlock( cj->progeny[0]->sorts[10] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[0] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[3]->sorts[11] , t ); task_addunlock( cj->progeny[0]->sorts[11] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[0] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[6]->sorts[7] , t ); task_addunlock( cj->progeny[0]->sorts[7] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[0] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[6] , t ); task_addunlock( cj->progeny[0]->sorts[6] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[2] , cj->progeny[1] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[2]->sorts[9] , t ); task_addunlock( cj->progeny[1]->sorts[9] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[1] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[3]->sorts[10] , t ); task_addunlock( cj->progeny[1]->sorts[10] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[1] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[6]->sorts[8] , t ); task_addunlock( cj->progeny[1]->sorts[8] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[1] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[7] , t ); task_addunlock( cj->progeny[1]->sorts[7] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[2] , cj->progeny[4] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[2]->sorts[1] , t ); task_addunlock( cj->progeny[4]->sorts[1] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[4] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[3]->sorts[2] , t ); task_addunlock( cj->progeny[4]->sorts[2] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[4] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[6]->sorts[10] , t ); task_addunlock( cj->progeny[4]->sorts[10] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[4] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[11] , t ); task_addunlock( cj->progeny[4]->sorts[11] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[2] , cj->progeny[5] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[2]->sorts[0] , t ); task_addunlock( cj->progeny[5]->sorts[0] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[5] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[3]->sorts[1] , t ); task_addunlock( cj->progeny[5]->sorts[1] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[5] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[6]->sorts[9] , t ); task_addunlock( cj->progeny[5]->sorts[9] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[5] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[10] , t ); task_addunlock( cj->progeny[5]->sorts[10] , t ); + } + ci->progeny[2]->nr_pairs += 4; + ci->progeny[3]->nr_pairs += 4; + ci->progeny[6]->nr_pairs += 4; + ci->progeny[7]->nr_pairs += 4; + cj->progeny[0]->nr_pairs += 4; + cj->progeny[1]->nr_pairs += 4; + cj->progeny[4]->nr_pairs += 4; + cj->progeny[5]->nr_pairs += 4; + break; + + case 11: /* ( 0 , 1 , -1 ) */ + if ( !ci->progeny[2]->split && !ci->progeny[6]->split && + !cj->progeny[1]->split && !cj->progeny[5]->split ) { + t->type = tid_sub; t->flags = 11; + task_addunlock( ci->progeny[2]->sorts[11] , t ); task_addunlock( cj->progeny[1]->sorts[11] , t ); + task_addunlock( ci->progeny[6]->sorts[11] , t ); task_addunlock( cj->progeny[5]->sorts[11] , t ); + task_addunlock( ci->progeny[2]->sorts[2] , t ); task_addunlock( cj->progeny[5]->sorts[2] , t ); + task_addunlock( ci->progeny[6]->sorts[6] , t ); task_addunlock( cj->progeny[1]->sorts[6] , t ); + } + else { + t->ci = ci->progeny[2]; t->cj = cj->progeny[1]; + task_addunlock( ci->progeny[2]->sorts[11] , t ); task_addunlock( cj->progeny[1]->sorts[11] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[5] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[6]->sorts[11] , t ); task_addunlock( cj->progeny[5]->sorts[11] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[2] , cj->progeny[5] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[2]->sorts[2] , t ); task_addunlock( cj->progeny[5]->sorts[2] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[1] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[6]->sorts[6] , t ); task_addunlock( cj->progeny[1]->sorts[6] , t ); + } + ci->progeny[2]->nr_pairs += 2; + ci->progeny[6]->nr_pairs += 2; + cj->progeny[1]->nr_pairs += 2; + cj->progeny[5]->nr_pairs += 2; + break; + + case 12: /* ( 0 , 0 , 1 ) */ + if ( !ci->progeny[1]->split && !ci->progeny[3]->split && !ci->progeny[5]->split && !ci->progeny[7]->split && + !cj->progeny[0]->split && !cj->progeny[2]->split && !cj->progeny[4]->split && !cj->progeny[6]->split) { + t->type = tid_sub; t->flags = 12; + task_addunlock( ci->progeny[1]->sorts[12] , t ); task_addunlock( cj->progeny[0]->sorts[12] , t ); + task_addunlock( ci->progeny[3]->sorts[11] , t ); task_addunlock( cj->progeny[0]->sorts[11] , t ); + task_addunlock( ci->progeny[5]->sorts[5] , t ); task_addunlock( cj->progeny[0]->sorts[5] , t ); + task_addunlock( ci->progeny[7]->sorts[2] , t ); task_addunlock( cj->progeny[0]->sorts[2] , t ); + task_addunlock( ci->progeny[1]->sorts[9] , t ); task_addunlock( cj->progeny[2]->sorts[9] , t ); + task_addunlock( ci->progeny[3]->sorts[12] , t ); task_addunlock( cj->progeny[2]->sorts[12] , t ); + task_addunlock( ci->progeny[5]->sorts[8] , t ); task_addunlock( cj->progeny[2]->sorts[8] , t ); + task_addunlock( ci->progeny[7]->sorts[5] , t ); task_addunlock( cj->progeny[2]->sorts[5] , t ); + task_addunlock( ci->progeny[1]->sorts[3] , t ); task_addunlock( cj->progeny[4]->sorts[3] , t ); + task_addunlock( ci->progeny[3]->sorts[6] , t ); task_addunlock( cj->progeny[4]->sorts[6] , t ); + task_addunlock( ci->progeny[5]->sorts[12] , t ); task_addunlock( cj->progeny[4]->sorts[12] , t ); + task_addunlock( ci->progeny[7]->sorts[11] , t ); task_addunlock( cj->progeny[4]->sorts[11] , t ); + task_addunlock( ci->progeny[1]->sorts[0] , t ); task_addunlock( cj->progeny[6]->sorts[0] , t ); + task_addunlock( ci->progeny[3]->sorts[3] , t ); task_addunlock( cj->progeny[6]->sorts[3] , t ); + task_addunlock( ci->progeny[5]->sorts[9] , t ); task_addunlock( cj->progeny[6]->sorts[9] , t ); + task_addunlock( ci->progeny[7]->sorts[12] , t ); task_addunlock( cj->progeny[6]->sorts[12] , t ); + } + else { + t->ci = ci->progeny[1]; t->cj = cj->progeny[0]; + task_addunlock( ci->progeny[1]->sorts[12] , t ); task_addunlock( cj->progeny[0]->sorts[12] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[0] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[3]->sorts[11] , t ); task_addunlock( cj->progeny[0]->sorts[11] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[0] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[5]->sorts[5] , t ); task_addunlock( cj->progeny[0]->sorts[5] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[0] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[2] , t ); task_addunlock( cj->progeny[0]->sorts[2] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[1] , cj->progeny[2] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[1]->sorts[9] , t ); task_addunlock( cj->progeny[2]->sorts[9] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[2] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[3]->sorts[12] , t ); task_addunlock( cj->progeny[2]->sorts[12] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[2] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[5]->sorts[8] , t ); task_addunlock( cj->progeny[2]->sorts[8] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[2] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[5] , t ); task_addunlock( cj->progeny[2]->sorts[5] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[1] , cj->progeny[4] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[1]->sorts[3] , t ); task_addunlock( cj->progeny[4]->sorts[3] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[4] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[3]->sorts[6] , t ); task_addunlock( cj->progeny[4]->sorts[6] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[4] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[5]->sorts[12] , t ); task_addunlock( cj->progeny[4]->sorts[12] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[4] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[11] , t ); task_addunlock( cj->progeny[4]->sorts[11] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[1] , cj->progeny[6] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[1]->sorts[0] , t ); task_addunlock( cj->progeny[6]->sorts[0] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[6] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[3]->sorts[3] , t ); task_addunlock( cj->progeny[6]->sorts[3] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[6] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[5]->sorts[9] , t ); task_addunlock( cj->progeny[6]->sorts[9] , t ); + t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[6] , NULL , 0 , NULL , 0 ); + task_addunlock( ci->progeny[7]->sorts[12] , t ); task_addunlock( cj->progeny[6]->sorts[12] , t ); + } + ci->progeny[1]->nr_pairs += 4; + ci->progeny[3]->nr_pairs += 4; + ci->progeny[5]->nr_pairs += 4; + ci->progeny[7]->nr_pairs += 4; + cj->progeny[0]->nr_pairs += 4; + cj->progeny[2]->nr_pairs += 4; + cj->progeny[4]->nr_pairs += 4; + cj->progeny[6]->nr_pairs += 4; + break; + + } + + /* Take a step back... */ + tid -= 1; + + } /* split this task? */ + + } /* loop over all tasks. */ + + } + + +/** + * @brief Fill the #space's task list. + * + * @param s The #space we are working in. + * @param do_sort Flag to add sorting tasks to the list. + */ + +void space_maketasks ( struct space *s , int do_sort ) { + + int i, j, k, ii, jj, kk, iii, jjj, kkk, cid, cjd; + int *cdim = s->cdim; + int nr_tasks_old = s->nr_tasks; + struct task *t; + int pts[7][8] = { { -1 , 12 , 10 , 9 , 4 , 3 , 1 , 0 } , + { -1 , -1 , 11 , 10 , 5 , 4 , 2 , 1 } , + { -1 , -1 , -1 , 12 , 7 , 6 , 4 , 3 } , + { -1 , -1 , -1 , -1 , 8 , 7 , 5 , 4 } , + { -1 , -1 , -1 , -1 , -1 , 12 , 10 , 9 } , + { -1 , -1 , -1 , -1 , -1 , -1 , 11 , 10 } , + { -1 , -1 , -1 , -1 , -1 , -1 , -1 , 12 } }; + int counts[tid_count]; + + /* Recursive function to generate tasks in the cell tree. */ + void maketasks_rec ( struct cell *c , struct task *sort_up[] , int nr_sort_up , struct cell *parent ) { + + int j, k, nr_sort = 0; + struct task *sort[14], *t; + + /* Clear the waits on this cell. */ + c->wait = 0; + sort[0] = NULL; + + /* Start by generating the sort task. */ + if ( c->count > 0 ) { + + if ( do_sort ) { + if ( c->count < 500 ) { + sort[0] = space_addtask( s , tid_sort , 0x1fff , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + for ( k = 0 ; k < 13 ; k++ ) + c->sorts[k] = sort[0]; + nr_sort = 1; + } + else if ( c->count < 2000 ) { + sort[0] = space_addtask( s , tid_sort , 0xf , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + sort[1] = space_addtask( s , tid_sort , 0xf0 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + sort[2] = space_addtask( s , tid_sort , 0x1f00 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + for ( k = 0 ; k < 4 ; k++ ) + c->sorts[k] = sort[0]; + for ( k = 4 ; k < 8 ; k++ ) + c->sorts[k] = sort[1]; + for ( k = 8 ; k < 13 ; k++ ) + c->sorts[k] = sort[2]; + nr_sort = 3; + } + else { + c->sorts[0] = sort[0] = space_addtask( s , tid_sort , 0x1 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + c->sorts[1] = sort[1] = space_addtask( s , tid_sort , 0x2 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + c->sorts[2] = sort[2] = space_addtask( s , tid_sort , 0x4 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + c->sorts[3] = sort[3] = space_addtask( s , tid_sort , 0x8 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + c->sorts[4] = sort[4] = space_addtask( s , tid_sort , 0x10 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + c->sorts[5] = sort[5] = space_addtask( s , tid_sort , 0x20 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + c->sorts[6] = sort[6] = space_addtask( s , tid_sort , 0x40 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + c->sorts[7] = sort[7] = space_addtask( s , tid_sort , 0x80 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + c->sorts[8] = sort[8] = space_addtask( s , tid_sort , 0x100 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + c->sorts[9] = sort[9] = space_addtask( s , tid_sort , 0x200 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + c->sorts[10] = sort[10] = space_addtask( s , tid_sort , 0x400 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + c->sorts[11] = sort[11] = space_addtask( s , tid_sort , 0x800 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + c->sorts[12] = sort[12] = space_addtask( s , tid_sort , 0x1000 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 ); + nr_sort = 13; + } + } + + /* Generate a self-interaction if not split. */ + if ( !c->split && c->count > 1 ) + space_addtask( s , tid_self , 0 , 0 , c , NULL , NULL , 0 , NULL , 0 ); + + } + + /* Otherwise, add the interactions between progeny. */ + if ( c->split ) { + + /* Recurse. */ + for ( k = 0 ; k < 8 ; k++ ) + if ( c->progeny[k] != NULL ) + maketasks_rec( c->progeny[k] , sort , nr_sort , c ); + + /* Worth splitting into several tasks? */ + if ( c->count > 2*space_splitsize ) { + + /* Make a task for eac pair of progeny. */ + for ( j = 0 ; j < 8 ; j++ ) + if ( c->progeny[j] != NULL && c->progeny[j]->count > 0 ) + for ( k = j + 1 ; k < 8 ; k++ ) + if ( c->progeny[k] != NULL && c->progeny[k]->count > 0 ) { + t = space_addtask( s , tid_pair , 0 , 0 , c->progeny[j] , c->progeny[k] , NULL , 0 , NULL , 0 ); + task_addunlock( c->progeny[j]->sorts[ pts[j][k] ] , t ); + task_addunlock( c->progeny[k]->sorts[ pts[j][k] ] , t ); + c->progeny[k]->nr_pairs += 1; + c->progeny[j]->nr_pairs += 1; + } + + } + + /* Otherwise, dispatch as one large task. */ + else { + + /* Add the task. */ + t = space_addtask( s , tid_sub , 0 , 0 , c , NULL , NULL , 0 , NULL , 0 ); + + /* Make it depend on all the sorts of its progeny. */ + for ( k = 0 ; k < 8 ; k++ ) + for ( j = 0 ; j < 13 ; j++ ) + task_addunlock( c->progeny[k]->sorts[j] , t ); + + } + + } + + } + + /* Allocate the task-list, if needed. */ + if ( s->tasks == NULL ) + if ( posix_memalign( (void *)&s->tasks , 64 , sizeof(struct task) * s->tot_cells * 14 ) != 0 ) + error( "Failed to allocate task list." ); + s->nr_tasks = 0; + + /* Loop over the cells and get their sub-tasks. */ + for ( k = 0 ; k < s->nr_cells ; k++ ) + maketasks_rec( &s->cells[k] , NULL , 0 , NULL ); + + /* Run through the highest level of cells and add pairs. */ + for ( i = 0 ; i < cdim[0] ; i++ ) + for ( j = 0 ; j < cdim[1] ; j++ ) + for ( k = 0 ; k < cdim[2] ; k++ ) { + cid = cell_getid( cdim , i , j , k ); + if ( s->cells[cid].count == 0 ) + continue; + for ( ii = -1 ; ii < 2 ; ii++ ) { + iii = i + ii; + if ( !s->periodic && ( iii < 0 || iii >= cdim[0] ) ) + continue; + iii = ( iii + cdim[0] ) % cdim[0]; + for ( jj = -1 ; jj < 2 ; jj++ ) { + jjj = j + jj; + if ( !s->periodic && ( jjj < 0 || jjj >= cdim[1] ) ) + continue; + jjj = ( jjj + cdim[1] ) % cdim[1]; + for ( kk = -1 ; kk < 2 ; kk++ ) { + kkk = k + kk; + if ( !s->periodic && ( kkk < 0 || kkk >= cdim[2] ) ) + continue; + kkk = ( kkk + cdim[2] ) % cdim[2]; + cjd = cell_getid( cdim , iii , jjj , kkk ); + if ( s->cells[cjd].count == 0 ) + continue; + if ( cid >= cjd ) + continue; + t = space_addtask( s , tid_pair , 0 , 0 , &s->cells[cid] , &s->cells[cjd] , NULL , 0 , NULL , 0 ); + task_addunlock( s->cells[cid].sorts[ sortlistID[ (kk+1) + 3*( (jj+1) + 3*(ii+1) ) ] ] , t ); + task_addunlock( s->cells[cjd].sorts[ sortlistID[ (kk+1) + 3*( (jj+1) + 3*(ii+1) ) ] ] , t ); + s->cells[cid].nr_pairs += 1; + s->cells[cjd].nr_pairs += 1; + } + } + } + } + + /* Split the tasks. */ + space_splittasks( s ); + + /* Did we already create indices? */ + if ( s->tasks_ind == NULL ) + if ( ( s->tasks_ind = (int *)malloc( sizeof(int) * s->nr_tasks ) ) == NULL ) + error( "Failed to allocate task indices." ); + + /* Did the number of tasks change, i.e. do we have to re-index? */ + if ( nr_tasks_old != s->nr_tasks ) + for ( k = 0 ; k < s->nr_tasks ; k++ ) + s->tasks_ind[k] = k; + + /* Remove sort tasks with no dependencies. */ + for ( k = 0 ; k < s->nr_tasks ; k++ ) { + t = &s->tasks[k]; + if ( t->type == tid_sort && t->nr_unlock_tasks == 0 ) { + t->type = tid_none; + if ( t->ci->split ) + for ( j = 0 ; j < 8 ; j++ ) + if ( t->ci->progeny[j] != NULL && t->flags & ( 1 << j ) ) + task_rmunlock( t->ci->progeny[j]->sorts[j] , t ); + } + } + + /* Count the number of each task type. */ + for ( k = 0 ; k < tid_count ; k++ ) + counts[k] = 0; + for ( k = 0 ; k < s->nr_tasks ; k++ ) + counts[ s->tasks[k].type ] += 1; + printf( "space_maketasks: task counts are [ %s=%i" , taskID_names[0] , counts[0] ); + for ( k = 1 ; k < tid_count ; k++ ) + printf( " %s=%i" , taskID_names[k] , counts[k] ); + printf( " ]\n" ); + + /* Re-set the next task pointer. */ + s->next_task = 0; + + } + + +/** + * @brief Sort the parts into eight bins along the given pivots. + * + * @param c The #cell array to be sorted. + */ + +void cell_split ( struct cell *c ) { + + int i, j, k; + struct part temp, *parts = c->parts; + int left[8], right[8]; + double pivot[3]; + + /* Init the pivot. */ + for ( k = 0 ; k < 3 ; k++ ) + pivot[k] = c->loc[k] + c->h[k]/2; + + /* Split along the x-axis. */ + i = 0; j = c->count - 1; + while ( i < j ) { + while ( i < c->count-1 && parts[i].x[0] <= pivot[0] ) + i += 1; + while ( j > 0 && parts[j].x[0] > pivot[0] ) + j -= 1; + if ( i < j ) { + temp = parts[i]; parts[i] = parts[j]; parts[j] = temp; + } + } + left[1] = i; right[1] = c->count - 1; + left[0] = 0; right[0] = j; + + /* Split along the y axis, twice. */ + for ( k = 1 ; k >= 0 ; k-- ) { + i = left[k]; j = right[k]; + while ( i < j ) { + while ( i < right[k] && parts[i].x[1] <= pivot[1] ) + i += 1; + while ( j > left[k] && parts[j].x[1] > pivot[1] ) + j -= 1; + if ( i < j ) { + temp = parts[i]; parts[i] = parts[j]; parts[j] = temp; + } + } + left[2*k+1] = i; right[2*k+1] = right[k]; + left[2*k] = left[k]; right[2*k] = j; + } + + /* Split along the z axis, four times. */ + for ( k = 3 ; k >= 0 ; k-- ) { + i = left[k]; j = right[k]; + while ( i < j ) { + while ( i < right[k] && parts[i].x[2] <= pivot[2] ) + i += 1; + while ( j > left[k] && parts[j].x[2] > pivot[2] ) + j -= 1; + if ( i < j ) { + temp = parts[i]; parts[i] = parts[j]; parts[j] = temp; + } + } + left[2*k+1] = i; right[2*k+1] = right[k]; + left[2*k] = left[k]; right[2*k] = j; + } + + /* Store the counts and offsets. */ + for ( k = 0 ; k < 8 ; k++ ) { + c->progeny[k]->count = right[k] - left[k] + 1; + if ( c->progeny[k]->count < 0 ) + abort(); + c->progeny[k]->parts = &c->parts[ left[k] ]; + } + + /* Verify a few sub-cells. */ + /* for ( k = 0 ; k < c->progeny[0]->count ; k++ ) + if ( c->progeny[0]->parts[k].x[0] > pivot[0] || + c->progeny[0]->parts[k].x[1] > pivot[1] || + c->progeny[0]->parts[k].x[2] > pivot[2] ) + error( "Sorting failed (progeny=0)." ); + for ( k = 0 ; k < c->progeny[1]->count ; k++ ) + if ( c->progeny[1]->parts[k].x[0] > pivot[0] || + c->progeny[1]->parts[k].x[1] > pivot[1] || + c->progeny[1]->parts[k].x[2] <= pivot[2] ) + error( "Sorting failed (progeny=1)." ); + for ( k = 0 ; k < c->progeny[2]->count ; k++ ) + if ( c->progeny[2]->parts[k].x[0] > pivot[0] || + c->progeny[2]->parts[k].x[1] <= pivot[1] || + c->progeny[2]->parts[k].x[2] > pivot[2] ) + error( "Sorting failed (progeny=2)." ); */ + + } + + +/** + * @brief Split cells that contain too many particles. + * + * @param s The #space we are working in. + * @param c The #cell under consideration. + */ + +void space_split ( struct space *s , struct cell *c ) { + + int k, count; + double r, r_limit, r_max = 0.0; + struct cell *temp; + + /* Check the depth. */ + if ( c->depth > s->maxdepth ) + s->maxdepth = c->depth; + + /* Set the minimum cutoff. */ + r_limit = fmin( c->h[0] , fmin( c->h[1] , c->h[2] ) ) / 2; + + /* Count the particles below that. */ + for ( count = 0 , k = 0 ; k < c->count ; k++ ) { + r = c->parts[k].r; + if ( r <= r_limit ) + count += 1; + if ( r > r_max ) + r_max = r; + } + c->r_max = r_max; + + /* Split or let it be? */ + if ( count > c->count*space_splitratio && c->count > space_splitsize ) { + + /* No longer just a leaf. */ + c->split = 1; + + /* Create the cell's progeny. */ + for ( k = 0 ; k < 8 ; k++ ) { + temp = space_getcell( s ); + temp->count = 0; + temp->loc[0] = c->loc[0]; + temp->loc[1] = c->loc[1]; + temp->loc[2] = c->loc[2]; + temp->h[0] = c->h[0]/2; + temp->h[1] = c->h[1]/2; + temp->h[2] = c->h[2]/2; + if ( k & 4 ) + temp->loc[0] += temp->h[0]; + if ( k & 2 ) + temp->loc[1] += temp->h[1]; + if ( k & 1 ) + temp->loc[2] += temp->h[2]; + temp->depth = c->depth + 1; + temp->split = 0; + temp->r_max = 0.0; + temp->parent = c; + c->progeny[k] = temp; + } + + /* Split the cell data. */ + cell_split( c ); + + /* Recurse? */ + for ( k = 0 ; k < 8 ; k++ ) + space_split( s , c->progeny[k] ); + + /* Remove any progeny with zero parts. */ + for ( k = 0 ; k < 8 ; k++ ) + if ( c->progeny[k]->count == 0 ) { + space_recycle( s , c->progeny[k] ); + c->progeny[k] = NULL; + } + + } + + /* Otherwise, set the progeny to null. */ + else { + bzero( c->progeny , sizeof(struct cell *) * 8 ); + c->split = 0; + } + + } + + +/** + * @brief Return a used cell to the cell buffer. + * + * @param s The #space. + * @param c The #cell. + */ + +void space_recycle ( struct space *s , struct cell *c ) { + + /* Clear the cell. */ + if ( lock_destroy( &c->lock ) != 0 ) + error( "Failed to destroy spinlock." ); + + /* Hook this cell into the buffer. */ + c->next = s->cells_new; + s->cells_new = c; + s->tot_cells -= 1; + + } + + +/** + * @brief Get a new empty cell. + * + * @param s The #space. + */ + +struct cell *space_getcell ( struct space *s ) { + + struct cell *c; + int k; + + /* Is the buffer empty? */ + if ( s->cells_new == NULL ) { + if ( posix_memalign( (void *)&s->cells_new , 64 , space_cellallocchunk * sizeof(struct cell) ) != 0 ) + error( "Failed to allocate more cells." ); + bzero( s->cells_new , space_cellallocchunk * sizeof(struct cell) ); + for ( k = 0 ; k < space_cellallocchunk-1 ; k++ ) + s->cells_new[k].next = &s->cells_new[k+1]; + s->cells_new[ space_cellallocchunk-1 ].next = NULL; + } + + /* Pick off the next cell. */ + c = s->cells_new; + s->cells_new = c->next; + s->tot_cells += 1; + + /* Init some things in the cell. */ + bzero( c , sizeof(struct cell) ); + if ( lock_init( &c->lock ) != 0 ) + error( "Failed to initialize cell spinlock." ); + + return c; + + } + + +/** + * @brief Split the space into cells given the array of particles. + * + * @param The #space to initialize. + * @param dim Spatial dimensions of the domain. + * @param parts Pointer to an array of #part. + * @param N The number of parts in the space. + * @param periodic flag whether the domain is periodic or not. + * + * Makes a grid of edge length > r_max and fills the particles + * into the respective cells. Cells containing more than #space_maxppc + * parts with a cutoff below half the cell width are then split + * recursively. + */ + + +void space_init ( struct space *s , double dim[3] , struct part *parts , int N , int periodic , double h_max ) { + + int i, j, k; + int nr_cells, cdim[3]; + double r_min, r_max, h[3], ih[3]; + struct cell *c, *cells; + struct part *parts_new, *finger; + + + /* Get the minimum and maximum cutoff radii. */ + r_min = parts[0].r; r_max = r_min; + for ( k = 1 ; k < N ; k++ ) + if ( parts[k].r < r_min ) + r_min = parts[k].r; + else if ( parts[k].r > r_max ) + r_max = parts[k].r; + + /* Get the cell width. */ + if ( h_max < r_max ) + h_max = r_max; + for ( k = 0 ; k < 3 ; k++ ) { + cdim[k] = ceil( dim[k] / h_max ); + h[k] = dim[k] / cdim[k]; + ih[k] = 1.0 / h[k]; + } + + /* Allocate the highest level of cells. */ + nr_cells = cdim[0] * cdim[1] * cdim[2]; + if ( posix_memalign( (void *)&cells , 64 , nr_cells * sizeof(struct cell) ) != 0 ) + error( "Failed to allocate cells." ); + bzero( cells , nr_cells * sizeof(struct cell) ); + for ( k = 0 ; k < nr_cells ; k++ ) + if ( lock_init( &cells[k].lock ) != 0 ) + error( "Failed to init spinlock." ); + + /* Set the cell locations. */ + for ( i = 0 ; i < cdim[0] ; i++ ) + for ( j = 0 ; j < cdim[1] ; j++ ) + for ( k = 0 ; k < cdim[2] ; k++ ) { + c = &cells[ cell_getid( cdim , i , j , k ) ]; + c->loc[0] = i*h[0]; c->loc[1] = j*h[1]; c->loc[2] = k*h[2]; + c->h[0] = h[0]; c->h[1] = h[1]; c->h[2] = h[2]; + } + + /* Run through the particles and get the counts for each cell. */ + for ( k = 0 ; k < N ; k++ ) + cells[ cell_getid( cdim , parts[k].x[0]*ih[0] , parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) ].count += 1; + + /* Allocate the new part buffer and set the part pointers in each cell. */ + if ( posix_memalign( (void *)&parts_new , 64 , N * sizeof(struct part) ) != 0 ) + error( "Failed to allocate parts." ); + for ( finger = parts_new , k = 0 ; k < nr_cells ; k++ ) { + c = &cells[ k ]; + c->parts = finger; + finger = &finger[ c->count ]; + c->count = 0; + } + for ( k = 0 ; k < N ; k++ ) { + c = &cells[ cell_getid( cdim , parts[k].x[0]*ih[0] , parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) ]; + c->parts[ c->count ] = parts[k]; + c->count += 1; + } + + /* Loop over the cells and split them. */ + /* for ( k = 0 ; k < nr_cells ; k++ ) + if ( cells[k].count > space_maxppc ) + space_split( s , &cells[k] ); */ + + /* Store eveything in the space. */ + s->r_min = r_min; s->r_max = r_max; + s->dim[0] = dim[0]; s->dim[1] = dim[1]; s->dim[2] = dim[2]; + s->periodic = periodic; + s->parts = parts_new; + s->nr_parts = N; + s->h[0] = h[0]; s->h[1] = h[1]; s->h[2] = h[2]; + s->ih[0] = ih[0]; s->ih[1] = ih[1]; s->ih[2] = ih[2]; + s->cdim[0] = cdim[0]; s->cdim[1] = cdim[1]; s->cdim[2] = cdim[2]; + s->cells = cells; + s->nr_cells = nr_cells; + s->tot_cells = nr_cells; + if ( lock_init( &s->task_lock ) != 0 ) + error( "Failed to create task spin-lock." ); + + /* Loop over the cells and split them. */ + for ( k = 0 ; k < nr_cells ; k++ ) + space_split( s , &cells[k] ); + + } + diff --git a/space.h b/space.h new file mode 100644 index 0000000000000000000000000000000000000000..a75e5e7ebe1ffa9c65c48d121abe5f6ec5dc9df3 --- /dev/null +++ b/space.h @@ -0,0 +1,177 @@ + + +/* Some constants. */ +#define space_maxdepth 10 +#define space_cellallocchunk 1000 +#define space_splitratio 0.875 +#define space_splitsize 800 +#define task_maxwait 3 +#define task_maxunlock 39 + + +/* Map shift vector to sortlist. */ +extern const int sortlistID[27]; + + +/* The different task IDs. */ +enum taskIDs { + tid_none = 0, + tid_sort, + tid_self, + tid_pair, + tid_sub, + tid_count + }; +extern const char *taskID_names[]; + + +/* Data of a task. */ +struct task { + + int type, flags, wait, rank, done; + + int nr_unlock_tasks; + struct task *unlock_tasks[ task_maxunlock ]; + + int nr_unlock_cells; + struct cell *ci, *cj, *unlock_cells[2]; + + } __attribute__((aligned (64))); + + +/* Entry in a list of sorted indices. */ +struct entry { + float d; + int i; + }; + + +/* Data of a single particle. */ +struct part { + + /* Particle position. */ + double x[3]; + + /* Particle cutoff radius. */ + float r; + + /* Particle time-step. */ + float dt; + + /* Particle ID. */ + int id; + + /* Number of pairwise interactions. */ + float count; + int icount; + + } __attribute__((aligned (32))); + + +/* Structure to store the data of a single cell. */ +struct cell { + + /* The cell location on the grid. */ + double loc[3]; + + /* The cell dimensions. */ + double h[3]; + + /* Max radii in this cell. */ + double r_max; + + /* The depth of this cell in the tree. */ + int depth, split; + + /* Nr of parts. */ + int count; + + /* Pointers to the particle data. */ + struct part *parts; + + /* Pointers for the sorted indices. */ + struct entry *sort; + + /* Number of pairs associated with this cell. */ + int nr_pairs; + + /* Pointers to the next level of cells. */ + struct cell *progeny[8]; + + /* Parent cell. */ + struct cell *parent; + + /* The tasks computing this cell's sorts. */ + struct task *sorts[14]; + + /* Number of tasks this cell is waiting for and whether it is in use. */ + int wait; + + /* Is the data of this cell being used in a sub-cell? */ + int hold; + + /* Spin lock for various uses. */ + lock_type lock; + + /* Linking pointer for "memory management". */ + struct cell *next; + + } __attribute__((aligned (64))); + + +/* The space in which the cells reside. */ +struct space { + + /* Spatial extent. */ + double dim[3]; + + /* Cell widths. */ + double h[3], ih[3]; + + /* The minimum and maximum cutoff radii. */ + double r_min, r_max; + + /* Number of cells. */ + int nr_cells, tot_cells; + + /* Space dimensions in number of cells. */ + int maxdepth, cdim[3]; + + /* The (level 0) cells themselves. */ + struct cell *cells; + + /* Buffer of unused cells. */ + struct cell *cells_new; + + /* The particle data (cells have pointers to this). */ + struct part *parts; + + /* The sortlist data (cells hold pointers to these). */ + struct entry *sortlist; + + /* The total number of parts in the space. */ + int nr_parts; + + /* Is the space periodic? */ + int periodic; + + /* The list of tasks. */ + struct task *tasks; + int nr_tasks, next_task; + int *tasks_ind; + lock_type task_lock; + + }; + + +/* function prototypes. */ +struct cell *space_getcell ( struct space *s ); +struct task *space_gettask ( struct space *s ); +void space_init ( struct space *s , double dim[3] , struct part *parts , int N , int periodic , double h_max ); +void space_maketasks ( struct space *s , int do_sort ); +void space_map_cells ( struct space *s , void (*fun)( struct cell *c , void *data ) , void *data ); +void space_map_parts ( struct space *s , void (*fun)( struct part *p , struct cell *c , void *data ) , void *data ); +void space_recycle ( struct space *s , struct cell *c ); + + + diff --git a/test.c b/test.c new file mode 100644 index 0000000000000000000000000000000000000000..f22ed69dae37300c64745eaf31645e2cce2e7dbc --- /dev/null +++ b/test.c @@ -0,0 +1,545 @@ + +/* Some standard headers. */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <pthread.h> +#include <math.h> +#include <omp.h> + +/* Local headers. */ +#define INLINE +#include "cycle.h" +#include "lock.h" +#include "space.h" +#include "runner.h" + +/* Ticks per second on this machine. */ +#ifndef CPU_TPS + #define CPU_TPS 2.67e9 +#endif + +/* Error macro. */ +#define error(s) { printf( "%s:%s:%i: %s\n" , __FILE__ , __FUNCTION__ , __LINE__ , s ); abort(); } + + +/** + * @brief Mapping function to draw a specific cell (gnuplot). + */ + +void map_cells_plot ( struct cell *c , void *data ) { + + int k, depth = *(int *)data; + double *l = c->loc, *h = c->h; + + if ( c->depth >= depth ) { + + printf( "%.16e %.16e %.16e\n" , l[0] , l[1] , l[2] ); + printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1] , l[2] ); + printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1]+h[1] , l[2] ); + printf( "%.16e %.16e %.16e\n\n\n" , l[0] , l[1]+h[1] , l[2] ); + + printf( "%.16e %.16e %.16e\n" , l[0] , l[1] , l[2]+h[2] ); + printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1] , l[2]+h[2] ); + printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1]+h[1] , l[2]+h[2] ); + printf( "%.16e %.16e %.16e\n\n\n" , l[0] , l[1]+h[1] , l[2]+h[2] ); + + printf( "%.16e %.16e %.16e\n" , l[0] , l[1] , l[2] ); + printf( "%.16e %.16e %.16e\n" , l[0] , l[1]+h[1] , l[2] ); + printf( "%.16e %.16e %.16e\n" , l[0] , l[1]+h[1] , l[2]+h[2] ); + printf( "%.16e %.16e %.16e\n\n\n" , l[0] , l[1] , l[2]+h[2] ); + + printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1] , l[2] ); + printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1]+h[1] , l[2] ); + printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1]+h[1] , l[2]+h[2] ); + printf( "%.16e %.16e %.16e\n\n\n" , l[0]+h[0] , l[1] , l[2] +h[2]); + + printf( "%.16e %.16e %.16e\n" , l[0] , l[1] , l[2] ); + printf( "%.16e %.16e %.16e\n" , l[0] , l[1] , l[2]+h[2] ); + printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1] , l[2]+h[2] ); + printf( "%.16e %.16e %.16e\n\n\n" , l[0]+h[0] , l[1] , l[2] ); + + printf( "%.16e %.16e %.16e\n" , l[0] , l[1]+h[1] , l[2] ); + printf( "%.16e %.16e %.16e\n" , l[0] , l[1]+h[1] , l[2]+h[2] ); + printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1]+h[1] , l[2]+h[2] ); + printf( "%.16e %.16e %.16e\n\n\n" , l[0]+h[0] , l[1]+h[1] , l[2] ); + + for ( k = 0 ; k < c->count ; k++ ) + printf( "%.16e %.16e %.16e %.16e %.16e %.16e\n" , l[0]+h[0] , l[1]+h[1] , l[2] , + c->parts[k].x[0] , c->parts[k].x[1] , c->parts[k].x[2] ); + printf( "\n\n" ); + + } + + } + + +/** + * @brief Mapping function for checking if each part is in its box. + */ + +/* void map_check ( struct part *p , struct cell *c , void *data ) { + + if ( p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] || + p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] || + p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] ) + printf( "map_check: particle %i is outside of its box.\n" , p->id ); + + } */ + + +/** + * @brief Mapping function for neighbour count. + */ + +void map_count ( struct part *p , struct cell *c , void *data ) { + + double *count = (double *)data; + + // printf( "%e\n" , p->count ); + + *count += p->count; + + } + + +/** + * @brief Mapping function for neighbour count. + */ + +void map_icount ( struct part *p , struct cell *c , void *data ) { + + int *count = (int *)data; + + // printf( "%i\n" , p->icount ); + + *count += p->icount; + + } + + +/** + * @brief Mapping function to print the particle position. + */ + +void map_dump ( struct part *p , struct cell *c , void *data ) { + + double *shift = (double *)data; + + printf( "%g\t%g\t%g\n" , p->x[0]-shift[0] , p->x[1]-shift[1] , p->x[2]-shift[2] ); + + } + + +/** + * @brief Read coordinates from a text file. + * + * @param fname The name of the coordinate file. + * @param parts An array of #part in which to store the coordinates. + * @param N The number of parts to read. + */ + +void read_coords ( char *fname , struct part *parts , int N ) { + + FILE *fd; + int k; + + /* Open the given file. */ + if ( ( fd = fopen( fname , "r" ) ) == NULL ) + error( "Failed to open coordinate file" ); + + /* Read the coordinates into the part positions. */ + for ( k = 0 ; k < N ; k++ ) { + if ( fscanf( fd , "%lf %lf %lf" , &parts[k].x[0] , &parts[k].x[1] , &parts[k].x[2] ) != 3 ) { + printf( "read_coords: failed to read %ith entry.\n" , k ); + error( "Error reading coordinate file." ); + } + } + + } + + +/** + * @brief Read cutoffs from a text file. + * + * @param fname The name of the cutoffs file. + * @param parts An array of #part in which to store the cutoffs. + * @param N The number of parts to read. + */ + +void read_cutoffs ( char *fname , struct part *parts , int N ) { + + FILE *fd; + int k; + + /* Open the given file. */ + if ( ( fd = fopen( fname , "r" ) ) == NULL ) + error( "Failed to open cutoff file" ); + + /* Read the coordinates into the part positions. */ + for ( k = 0 ; k < N ; k++ ) { + if ( fscanf( fd , "%ef" , &parts[k].r ) != 1 ) { + printf( "read_cutoffs: failed to read %ith entry.\n" , k ); + error( "Error reading cutoff file." ); + } + } + + } + + +/** + * @brief Read id from a text file. + * + * @param fname The name of the id file. + * @param parts An array of #part in which to store the dt. + * @param N The number of parts to read. + */ + +void read_id ( char *fname , struct part *parts , int N ) { + + FILE *fd; + int k; + + /* Open the given file. */ + if ( ( fd = fopen( fname , "r" ) ) == NULL ) + error( "Failed to open id file" ); + + /* Read the coordinates into the part positions. */ + for ( k = 0 ; k < N ; k++ ) { + if ( fscanf( fd , "%i" , &parts[k].id ) != 1 ) { + printf( "read_id: failed to read %ith entry.\n" , k ); + error( "Error reading id file." ); + } + } + + } + + +/** + * @brief Read dt from a text file. + * + * @param fname The name of the dt file. + * @param parts An array of #part in which to store the dt. + * @param N The number of parts to read. + */ + +void read_dt ( char *fname , struct part *parts , int N ) { + + FILE *fd; + int k; + + /* Open the given file. */ + if ( ( fd = fopen( fname , "r" ) ) == NULL ) + error( "Failed to open dt file" ); + + /* Read the coordinates into the part positions. */ + for ( k = 0 ; k < N ; k++ ) { + if ( fscanf( fd , "%ef" , &parts[k].dt ) != 1 ) + error( "Error reading dt file." ); + } + + } + + +/** + * @brief Compute the average number of pairs per particle using + * a brute-force O(N^2) computation. + * + * @param dim The space dimensions. + * @param parts The #part array. + * @param N The number of parts. + * @param periodic Periodic boundary conditions flag. + */ + +void pairs_n2 ( double *dim , struct part *__restrict__ parts , int N , int periodic ) { + + int i, j, k, count = 0, mj, mk; + double r, dx[3], dcount = 0.0, maxratio = 1.0; + float f1, f2; + + /* Loop over all particle pairs. */ + #pragma omp parallel for schedule(dynamic), default(none), private(k,i,dx,r,f1,f2), shared(maxratio,mj,mk,periodic,parts,dim,N,stdout), reduction(+:count,dcount) + for ( j = 0 ; j < N ; j++ ) { + if ( j % 1000 == 0 ) { + printf( "pairs_n2: j=%i, count=%i.\n" , j , count ); + fflush(stdout); + } + for ( k = j+1 ; k < N ; k++ ) { + for ( i = 0 ; i < 3 ; i++ ) { + dx[i] = parts[j].x[i] - parts[k].x[i]; + if ( periodic ) { + if ( dx[i] < -dim[i]/2 ) + dx[i] += dim[i]; + else if ( dx[i] > dim[i]/2 ) + dx[i] -= dim[i]; + } + } + r = sqrt( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ); + if ( r < parts[j].r || r < parts[k].r ) { + f1 = 0.0; f2 = 0.0; + iact( r*r , parts[j].r , parts[k].r , &f1 , &f2 , &count , &count ); + dcount += f1 + f2; + if ( parts[j].r / parts[k].r > maxratio ) + #pragma omp critical + { + maxratio = parts[j].r / parts[k].r; + mj = j; mk = k; + } + else if ( parts[k].r / parts[j].r > maxratio ) + #pragma omp critical + { + maxratio = parts[k].r / parts[j].r; + mj = j; mk = k; + } + } + } + } + + /* Dump the result. */ + printf( "pairs_n2: avg. nr. of pairs per part is %.3f (%.3f).\n" , ((double)count)/N , dcount/N + 32.0/3 ); + printf( "pairs_n2: maximum ratio between parts %i [%e,%e,%e] and %i [%e,%e,%e] is %.3f/%.3f\n" , + mj , parts[mj].x[0] , parts[mj].x[1] , parts[mj].x[2] , + mk , parts[mk].x[0] , parts[mk].x[1] , parts[mk].x[2] , + parts[mj].r , parts[mk].r ); fflush(stdout); + fflush(stdout); + + } + + +/** + * @brief Find the pairs of a single particle + * + * @param dim The space dimensions. + * @param parts The #part array. + * @param N The number of parts. + * @param periodic Periodic boundary conditions flag. + * @param target the index of the target particle. + */ + +void pairs_single ( double *dim , struct part *__restrict__ parts , int N , int periodic , int target ) { + + int i, k, tid; + double r, tx[3], tr, dx[3]; + + /* Get the target position and radius. */ + for ( k = 0 ; k < 3 ; k++ ) + tx[k] = parts[target].x[k]; + tr = parts[target].r; + tid = parts[target].id; + + /* Loop over all particle pairs. */ + #pragma omp parallel for schedule(dynamic), default(none), private(k,i,dx,r), shared(target,tx,tr,tid,periodic,parts,dim,N) + for ( k = 0 ; k < N ; k++ ) { + if ( k == target ) + continue; + for ( i = 0 ; i < 3 ; i++ ) { + dx[i] = tx[i] - parts[k].x[i]; + if ( periodic ) { + if ( dx[i] < -dim[i]/2 ) + dx[i] += dim[i]; + else if ( dx[i] > dim[i]/2 ) + dx[i] -= dim[i]; + } + } + r = sqrt( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ); + if ( r < tr ) + printf( "pairs_single: %i %i [%e,%e,%e] %e\n" , + tid , parts[k].id , dx[0] , dx[1] , dx[2] , r ); + } + + } + + +/** + * @brief Main routine that loads a few particles and generates some output. + * + */ + +int main ( int argc , char *argv[] ) { + + int c, icount, j, k, N = 100, periodic = 1, nr_threads = 1, nr_queues = -1, runs = 1; + double dim[3] = { 1.0 , 1.0 , 1.0 }, shift[3] = { 0.0 , 0.0 , 0.0 }; + double r_min = 0.01, r_max = 0.1, h_max = -1.0 , scaling = 1.0, count = 0.0; + struct part *parts = NULL; + struct space s; + struct runner r; + ticks tic; + + /* Init the space. */ + bzero( &s , sizeof(struct space) ); + + /* Parse the options. */ + while ( ( c = getopt( argc , argv , "a:b:p:d:N:c:h:v:m:s:t:q:r:i:m:" ) ) != -1 ) + switch ( c ) { + case 'N': + if ( sscanf( optarg , "%d" , &N ) != 1 ) + error( "Error parsing number of particles." ); + if ( posix_memalign( (void *)&parts , 16 , N * sizeof(struct part) ) != 0 ) + error( "Call to posix_memalign failed." ); + for ( k = 0 ; k < N ; k++ ) { + parts[k].x[0] = ((double)rand()) / RAND_MAX * dim[0]; + parts[k].x[1] = ((double)rand()) / RAND_MAX * dim[1]; + parts[k].x[2] = ((double)rand()) / RAND_MAX * dim[2]; + parts[k].r = r_min + ((r_max - r_min)*rand())/RAND_MAX; + } + printf( "main: allocated memory for %i parts.\n" , N ); fflush(stdout); + break; + case 'a': + if ( sscanf( optarg , "%lf" , &scaling ) != 1 ) + error( "Error parsing cutoff scaling." ); + printf( "main: scaling cutoff by %.3f.\n" , scaling ); fflush(stdout); + for ( k = 0 ; k < N ; k++ ) + parts[k].r *= scaling; + break; + case 'b': + if ( sscanf( optarg , "%lf %lf %lf" , &dim[0] , &dim[1] , &dim[2] ) != 3 ) + error( "Error parsing box dimensions." ); + break; + case 'c': + printf( "main: reading parts from %s...\n" , optarg ); fflush(stdout); + if ( parts == NULL && posix_memalign( (void *)&parts , 16 , N * sizeof(struct part) ) != 0 ) + error( "Call to calloc failed." ); + read_coords( optarg , parts , N ); + break; + case 'd': + printf( "main: reading dt from %s...\n" , optarg ); fflush(stdout); + read_dt( optarg , parts , N ); + break; + case 'h': + printf( "main: reading cutoffs from %s...\n" , optarg ); fflush(stdout); + read_cutoffs( optarg , parts , N ); + break; + case 'i': + printf( "main: reading ids from %s...\n" , optarg ); fflush(stdout); + read_id( optarg , parts , N ); + break; + case 'm': + if ( sscanf( optarg , "%lf" , &h_max ) != 1 ) + error( "Error parsing h_max." ); + printf( "main: maximum h set to %e.\n" , h_max ); + break; + case 'p': + if ( sscanf( optarg , "%d" , &periodic ) != 1 ) + error( "Error parsing periodicity." ); + printf( "main: periodicity switched %s.\n" , periodic ? "on" : "off" ); + break; + case 'q': + if ( sscanf( optarg , "%d" , &nr_queues ) != 1 ) + error( "Error parsing number of queues." ); + break; + case 'r': + if ( sscanf( optarg , "%d" , &runs ) != 1 ) + error( "Error parsing number of runs." ); + break; + case 's': + if ( sscanf( optarg , "%lf %lf %lf" , &shift[0] , &shift[1] , &shift[2] ) != 3 ) + error( "Error parsing shift." ); + for ( k = 0 ; k < N ; k++ ) { + parts[k].x[0] += shift[0]; + parts[k].x[1] += shift[1]; + parts[k].x[2] += shift[2]; + } + printf( "main: shifted parts by [ %.3f %.3f %.3f ].\n" , shift[0] , shift[1] , shift[2] ); + break; + case 't': + if ( sscanf( optarg , "%d" , &nr_threads ) != 1 ) + error( "Error parsing number of threads." ); + omp_set_num_threads( nr_threads ); + break; + case '?': + error( "Unknown option." ); + break; + } + + /* Get the brute-force number of pairs. */ + // pairs_n2( dim , parts , N , periodic ); + // pairs_single( dim , parts , N , periodic , 63628 ); + fflush( stdout ); + + /* Set default number of queues. */ + if ( nr_queues < 0 ) + nr_queues = nr_threads; + + /* Initialize the space with this data. */ + tic = getticks(); + space_init( &s , dim , parts , N , periodic , h_max ); + printf( "main: space_init took %.3f ms.\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); fflush(stdout); + + /* Say a few nice things about the space we just created. */ + printf( "main: space dimensions are [ %.3f %.3f %.3f ].\n" , s.dim[0] , s.dim[1] , s.dim[2] ); + printf( "main: space %s periodic.\n" , s.periodic ? "is" : "isn't" ); + printf( "main: highest-level cell dimensions are [ %i %i %i ].\n" , s.cdim[0] , s.cdim[1] , s.cdim[2] ); + printf( "main: %i parts in %i cells.\n" , s.nr_parts , s.tot_cells ); + printf( "main: maximum depth is %d.\n" , s.maxdepth ); + printf( "main: cutoffs in [ %.3f %.3f ].\n" , s.r_min , s.r_max ); fflush(stdout); + + /* Dump the particle positions. */ + // space_map_parts( &s , &map_dump , shift ); + + /* Generate the tasks. */ + tic = getticks(); + space_maketasks( &s , 1 ); + printf( "main: generated %i tasks.\n" , s.nr_tasks ); + printf( "main: space_maketasks took %.3f ms.\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); fflush(stdout); + + /* Initialize the runner with this space. */ + tic = getticks(); + runner_init( &r , &s , nr_threads , nr_queues , runner_policy_steal | runner_policy_keep ); + printf( "main: runner_init took %.3f ms.\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); fflush(stdout); + + /* Let loose a runner on the space. */ + for ( j = 0 ; j < runs ; j++ ) { + printf( "main: starting run %i/%i with %i threads and %i queues...\n" , j+1 , runs , r.nr_threads , r.nr_queues ); fflush(stdout); + tic = getticks(); + #ifdef TIMER + for ( k = 0 ; k < runner_timer_count ; k++ ) + runner_timer[k] = 0; + #endif + #ifdef COUNTER + for ( k = 0 ; k < runner_counter_count ; k++ ) + runner_counter[k] = 0; + #endif + runner_run( &r , 0 ); + #ifdef TIMER + printf( "main: runner timers are [ %.3f" , runner_timer[0]/CPU_TPS*1000 ); + for ( k = 1 ; k < runner_timer_count ; k++ ) + printf( " %.3f" , ((double)runner_timer[k])/CPU_TPS*1000 ); + printf( " %.3f ] ms.\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); + #else + printf( "main: runner_run with %i threads took %.3f ms.\n" , nr_threads , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); + #endif + #ifdef COUNTER + printf( "main: runner counters are [ %d" , runner_counter[0] ); + for ( k = 1 ; k < runner_counter_count ; k++ ) + printf( " %d" , runner_counter[k] ); + printf( " ].\n" ); + #endif + printf( "main: runner queue lengths are [ %i" , r.queues[0].count ); + for ( k = 1 ; k < r.nr_queues ; k++ ) + printf( " %i" , r.queues[k].count ); + printf( " ].\n" ); + fflush(stdout); + } + + /* Get the average interactions per particle. */ + count = 0; + space_map_parts( &s , &map_count , &count ); + printf( "main: average interactions per particle is %.3f.\n" , count / s.nr_parts + 32.0/3 ); + + /* Get the average interactions per particle. */ + icount = 0; + space_map_parts( &s , &map_icount , &icount ); + printf( "main: average neighbours per particle is %.3f.\n" , (double)icount / s.nr_parts ); + + /* Get all the cells of a certain depth. */ + /* count = 11; + space_map_cells( &s , &map_cells_plot , &count ); */ + + /* Check for outliers. */ + // space_map_parts( &s , &map_check , NULL ); + + /* All is calm, all is bright. */ + return 0; + + } diff --git a/test_mc.sh b/test_mc.sh new file mode 100755 index 0000000000000000000000000000000000000000..11da8c39de578738b59ce152ce1102d5227e3cb8 --- /dev/null +++ b/test_mc.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Set the default OpenMP behaviour +export OMP_WAIT_POLICY=PASSIVE + +# Clear up the memory first +# ./memhog `free -g | grep Mem | awk '{print int(0.9*$2)}'` + +# loop over number of CPUs +for cpu in {1..32} +do + + # Set some environment variables for OpenMP + export OMP_NUM_THREADS=$cpu + export OMP_THREAD_LIMIT=$cpu + export OMP_PROC_BIND=TRUE + + # ./test -r 100 -t $cpu -q $cpu -b "25000 25000 25000" -N 149646 -c data/Coordinates.txt -s "12500 12500 12500" -h data/SmoothingLength.txt > test_${cpu}.dump + + ./test -r 100 -t $cpu -q $cpu -b "100 100 100" -N 3664514 -c data2/Coordinates.txt -h data2/SmoothingLength.txt > test2_${cpu}.dump + + # ./test -r 100 -t $cpu -q $cpu -b "140 140 140" -N 7741 -c shrink/Coordinates.txt -s "70 70 70" -h shrink/SmoothingLength.txt > shrink_${cpu}.dump + +done