diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..287b44282f88b500e77d201b9a3f3676688e5522
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,33 @@
+
+CC=gcc
+FC=gfortran
+
+OPTS=-DTIMER -DCOUNTER -DCPU_TPS=2.67e9
+CFLAGS=-O3 -g -std=gnu99 -Wall -Werror -march=native -mtune=native -ffast-math -fomit-frame-pointer -malign-double -fstrict-aliasing -fopenmp
+# CFLAGS=-O0 -g -std=gnu99 -Wall -Werror -fopenmp
+LDFLAGS=-lm -lpthread -fopenmp
+
+FFLAGS=$(CFLAGS)
+
+OBJS=space.o runner.o test.o
+
+default: all
+
+depfile: $(wildcard *.c) $(wildcard *.h)
+	$(CC) -MM *.c *.h > depfile
+    
+include depfile
+
+%.o: Makefile
+
+%.o: %.c Makefile
+	$(CC) -c $(CFLAGS) $(OPTS) $< -o $@
+# 	$(CC) -c $(CFLAGS) $(OPTS) $< -dA -S
+
+test: $(OBJS)
+	gcc $^ $(LDFLAGS) -o test
+
+all: test 
+
+clean:
+	rm -f $(OBJS) test
diff --git a/Opteron6174/test2.dump b/Opteron6174/test2.dump
new file mode 100644
index 0000000000000000000000000000000000000000..9f4b5ca55bbc9f86aafdf2e4e94293a99c57a1c7
--- /dev/null
+++ b/Opteron6174/test2.dump
@@ -0,0 +1,48 @@
+4725.400 5722.778 8400.450 3194.839 120.059 105.812 56.976 5.655 0.000 0.000 19072.309 0.000 0.000 
+4919.301 5755.781 8710.699 3294.436 145.669 130.172 68.079 5.951 0.989 1.456 9838.910 0.000 0.000 
+4986.648 5794.996 8912.560 3360.085 198.966 178.689 108.065 6.243 5.700 7.087 6702.370 0.000 0.000 
+5259.090 5782.343 9050.539 3391.693 206.106 186.435 105.906 6.455 5.320 5.107 5129.911 0.000 0.000 
+5069.935 5801.673 9141.677 3427.429 254.263 228.474 140.110 6.954 12.026 13.356 4109.165 0.000 0.000 
+5144.622 5772.367 9238.842 3460.168 240.814 217.191 127.767 6.964 10.301 10.270 3450.979 0.000 0.000 
+5154.364 5800.711 9217.054 3457.825 310.817 271.857 166.828 7.499 31.194 33.498 2972.789 0.000 0.000 
+5134.256 5778.530 9162.365 3433.527 274.115 242.285 143.293 7.479 19.334 19.719 2583.830 0.000 0.000 
+5157.989 5803.972 9253.761 3458.642 345.791 303.071 185.356 8.256 38.970 41.899 2332.102 0.000 0.000 
+5238.888 5784.603 9245.546 3459.337 304.001 263.466 158.093 8.064 26.586 26.403 2093.112 0.000 0.000 
+5194.632 5806.809 9286.337 3476.360 386.899 326.131 197.683 8.884 60.838 64.860 1917.308 0.000 0.000 
+5177.886 5780.655 9305.192 3478.527 351.232 291.278 170.044 8.479 55.968 57.658 1752.747 0.000 0.000 
+5282.438 5814.123 9370.344 3487.279 439.419 354.893 209.715 9.268 90.429 96.766 1643.057 0.000 0.000 
+5282.634 5785.133 9380.112 3494.921 368.999 309.481 184.850 8.954 50.828 50.418 1519.213 0.000 0.000 
+5299.171 5814.226 9353.443 3493.089 501.669 374.055 221.973 9.862 132.895 138.850 1431.006 0.000 0.000 
+5353.611 5783.394 9414.077 3502.090 404.104 325.553 197.225 9.566 67.871 67.871 1348.012 0.000 0.000 
+5348.965 5817.267 9426.694 3506.508 490.604 375.330 226.972 10.119 110.522 113.377 1271.295 0.000 0.000 
+5482.762 5794.389 9596.688 3515.471 456.408 346.792 208.519 9.985 99.973 99.417 1214.456 0.000 0.000 
+5510.358 5821.199 9434.239 3515.476 541.956 392.489 238.131 10.826 137.361 139.270 1151.907 0.000 0.000 
+5473.950 5784.403 9555.850 3524.260 521.190 362.134 215.772 10.350 151.132 151.251 1095.725 0.000 0.000 
+5451.099 5823.864 9470.511 3526.494 604.650 407.376 247.352 11.149 187.478 191.021 1046.104 0.000 0.000 
+5505.379 5797.034 9522.470 3532.581 525.123 382.731 230.351 10.745 135.603 132.723 998.445 0.000 0.000 
+5477.417 5828.421 9518.820 3535.798 656.047 441.104 265.603 12.163 204.144 206.721 962.972 0.000 0.000 
+5583.622 5806.082 9566.668 3544.756 557.810 391.739 239.332 11.165 152.257 148.435 923.460 0.000 0.000 
+5515.739 5836.443 9570.943 3550.382 748.315 463.250 275.140 12.299 276.417 278.800 895.218 0.000 0.000 
+5587.893 5817.037 9645.617 3558.896 595.736 409.485 250.071 11.453 173.158 168.782 859.240 0.000 0.000 
+5542.670 5842.593 9748.742 3572.753 770.110 473.833 286.208 12.795 276.521 276.787 839.777 0.000 0.000 
+5701.095 5822.677 9928.963 3604.407 651.649 436.940 264.515 12.132 205.074 198.781 816.121 0.000 0.000 
+5657.524 5849.707 9725.956 3586.328 773.924 487.765 294.339 13.456 254.027 251.461 786.797 0.000 0.000 
+5777.192 5822.372 9935.261 3617.644 743.599 456.303 276.207 12.457 279.953 272.829 768.997 0.000 0.000 
+5660.033 5851.597 9713.186 3599.779 820.166 493.151 295.179 13.596 301.961 298.833 738.352 0.000 0.000 
+5640.843 5807.036 9798.463 3591.253 629.146 351.585 180.046 14.269 265.955 256.574 709.472 0.000 0.000 
+5714.651 5851.018 9832.902 3614.952 772.222 401.663 197.861 17.038 337.647 332.804 695.977 0.000 0.000 
+5724.058 5807.343 9806.919 3602.480 656.785 355.842 183.545 14.471 288.564 277.148 668.951 0.000 0.000 
+5681.948 5852.792 9813.468 3623.392 877.910 416.657 201.330 17.454 430.776 425.555 669.068 0.000 0.000 
+6265.551 5826.791 9949.478 3645.344 1030.717 661.847 208.148 17.762 568.915 424.458 663.718 0.000 0.000 
+5763.804 5862.045 10168.989 3683.041 966.853 442.515 222.032 19.469 485.971 479.159 650.065 0.000 0.000 
+5932.583 5822.891 10004.099 3656.150 763.985 405.568 205.704 17.345 343.667 327.326 624.866 0.000 0.000 
+5890.683 5882.241 10203.252 3737.363 979.443 468.428 233.404 21.593 474.652 465.229 613.284 0.000 0.000 
+5907.127 5828.063 9983.026 3645.104 777.450 422.543 212.224 18.077 338.502 320.211 593.760 0.000 0.000 
+5973.892 5878.569 10414.183 3776.429 988.204 502.585 245.691 23.177 452.688 437.958 593.734 0.000 0.000 
+5999.062 5843.136 10431.820 3776.043 886.512 466.573 244.742 21.303 401.304 379.597 584.426 0.000 0.000 
+5802.772 5879.462 10255.706 3782.129 1131.678 511.443 243.752 23.627 582.060 569.901 569.950 0.000 0.000 
+6040.364 5838.864 10172.545 3704.922 927.791 486.075 238.959 21.438 423.612 400.392 545.770 0.000 0.000 
+5974.247 5889.443 10785.004 3906.344 1201.819 582.030 286.344 28.453 581.319 563.692 565.256 0.000 0.000 
+6185.360 5871.308 10675.076 3862.395 1044.241 524.659 278.511 25.544 497.958 468.343 551.707 0.000 0.000 
+6098.932 5901.716 10787.633 3909.595 1242.428 582.900 286.725 28.737 615.565 592.976 536.952 0.000 0.000 
+6237.389 5863.149 10836.057 3899.096 1144.514 553.592 292.793 27.093 571.810 539.514 527.113 0.000 0.000 
diff --git a/cycle.h b/cycle.h
new file mode 100644
index 0000000000000000000000000000000000000000..e357a017c79a6e9befb73e2b988bd23918b66f37
--- /dev/null
+++ b/cycle.h
@@ -0,0 +1,528 @@
+/*
+ * Copyright (c) 2003, 2007-8 Matteo Frigo
+ * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+
+/* machine-dependent cycle counters code. Needs to be inlined. */
+
+/***************************************************************************/
+/* To use the cycle counters in your code, simply #include "cycle.h" (this
+   file), and then use the functions/macros:
+
+                 ticks getticks(void);
+
+   ticks is an opaque typedef defined below, representing the current time.
+   You extract the elapsed time between two calls to gettick() via:
+
+                 double elapsed(ticks t1, ticks t0);
+
+   which returns a double-precision variable in arbitrary units.  You
+   are not expected to convert this into human units like seconds; it
+   is intended only for *comparisons* of time intervals.
+
+   (In order to use some of the OS-dependent timer routines like
+   Solaris' gethrtime, you need to paste the autoconf snippet below
+   into your configure.ac file and #include "config.h" before cycle.h,
+   or define the relevant macros manually if you are not using autoconf.)
+*/
+
+/***************************************************************************/
+/* This file uses macros like HAVE_GETHRTIME that are assumed to be
+   defined according to whether the corresponding function/type/header
+   is available on your system.  The necessary macros are most
+   conveniently defined if you are using GNU autoconf, via the tests:
+   
+   dnl ---------------------------------------------------------------------
+
+   AC_C_INLINE
+   AC_HEADER_TIME
+   AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h])
+
+   AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif])
+
+   AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time])
+
+   dnl Cray UNICOS _rtc() (real-time clock) intrinsic
+   AC_MSG_CHECKING([for _rtc intrinsic])
+   rtc_ok=yes
+   AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H
+#include <intrinsics.h>
+#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no])
+   AC_MSG_RESULT($rtc_ok)
+
+   dnl ---------------------------------------------------------------------
+*/
+
+/***************************************************************************/
+
+#if TIME_WITH_SYS_TIME
+# include <sys/time.h>
+# include <time.h>
+#else
+# if HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
+
+#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \
+{									  \
+     return (double)t1 - (double)t0;					  \
+}
+
+/*----------------------------------------------------------------*/
+/* Solaris */
+#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER)
+typedef hrtime_t ticks;
+
+#define getticks gethrtime
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* AIX v. 4+ routines to read the real-time clock or time-base register */
+#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER)
+typedef timebasestruct_t ticks;
+
+static __inline ticks getticks(void)
+{
+     ticks t;
+     read_real_time(&t, TIMEBASE_SZ);
+     return t;
+}
+
+static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */
+{
+     time_base_to_time(&t1, TIMEBASE_SZ);
+     time_base_to_time(&t0, TIMEBASE_SZ);
+     return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 + 
+	     ((double)t1.tb_low - (double)t0.tb_low));
+}
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PowerPC ``cycle'' counter using the time base register.
+ */
+#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__))))  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned int tbl, tbu0, tbu1;
+
+     do {
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+	  __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+     } while (tbu0 != tbu1);
+
+     return (((unsigned long long)tbu0) << 32) | tbl;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* MacOS/Mach (Darwin) time-base register interface (unlike UpTime,
+   from Carbon, requires no additional libraries to be linked). */
+#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER)
+#include <mach/mach_time.h>
+typedef uint64_t ticks;
+#define getticks mach_absolute_time
+INLINE_ELAPSED(__inline__)
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * Pentium cycle counter 
+ */
+#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+#ifndef INLINE
+# if __GNUC__ && !__GNUC_STDC_INLINE__
+#  define INLINE extern inline
+# else
+#  define INLINE inline
+# endif
+#endif
+INLINE ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("rdtsc": "=A" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#endif
+
+/* Visual C++ -- thanks to Morten Nissov for his help with this */
+#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER)
+#include <windows.h>
+typedef LARGE_INTEGER ticks;
+#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */
+
+static __inline ticks getticks(void)
+{
+     ticks retval;
+
+     __asm {
+	  RDTSC
+	  mov retval.HighPart, edx
+	  mov retval.LowPart, eax
+     }
+     return retval;
+}
+
+static __inline double elapsed(ticks t1, ticks t0)
+{  
+     return (double)t1.QuadPart - (double)t0.QuadPart;
+}  
+
+#define HAVE_TICK_COUNTER
+#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * X86-64 cycle counter
+ */
+#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+#ifndef INLINE
+# if __GNUC__ && !__GNUC_STDC_INLINE__
+#  define INLINE extern inline
+# else
+#  define INLINE inline
+# endif
+#endif
+INLINE ticks getticks(void)
+{
+     unsigned a, d; 
+     asm volatile("rdtsc" : "=a" (a), "=d" (d)); 
+     return ((ticks)a) | (((ticks)d) << 32); 
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori.
+   NOTE: this code will fail to link unless you use the -Masmkeyword compiler
+   option (grrr). */
+#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) 
+typedef unsigned long long ticks;
+static ticks getticks(void)
+{
+    asm(" rdtsc; shl    $0x20,%rdx; mov    %eax,%eax; or     %rdx,%rax;    ");
+}
+INLINE_ELAPSED(__inline__)
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Visual C++, courtesy of Dirk Michaelis */
+#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER)
+
+#include <intrin.h>
+#pragma intrinsic(__rdtsc)
+typedef unsigned __int64 ticks;
+#define getticks __rdtsc
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * IA64 cycle counter
+ */
+
+/* intel's icc/ecc compiler */
+#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+#include <ia64intrin.h>
+
+static __inline__ ticks getticks(void)
+{
+     return __getReg(_IA64_REG_AR_ITC);
+}
+ 
+INLINE_ELAPSED(__inline__)
+ 
+#define HAVE_TICK_COUNTER
+#endif
+
+/* gcc */
+#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */
+#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER)
+#include <machine/sys/inline.h>
+typedef unsigned long ticks;
+
+static inline ticks getticks(void)
+{
+     ticks ret;
+
+     ret = _Asm_mov_from_ar (_AREG_ITC);
+     return ret;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Microsoft Visual C++ */
+#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned __int64 ticks;
+
+#  ifdef __cplusplus
+extern "C"
+#  endif
+ticks __getReg(int whichReg);
+#pragma intrinsic(__getReg)
+
+static __inline ticks getticks(void)
+{
+     volatile ticks temp;
+     temp = __getReg(3116);
+     return temp;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PA-RISC cycle counter 
+ */
+#if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+#  ifdef __GNUC__
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("mfctl 16, %0": "=r" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+#  else
+#  include <machine/inline.h>
+static inline unsigned long getticks(void)
+{
+     register ticks ret;
+     _MFCTL(16, ret);
+     return ret;
+}
+#  endif
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* S390, courtesy of James Treacy */
+#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks cycles;
+     __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc");
+     return cycles;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER)
+/*
+ * The 32-bit cycle counter on alpha overflows pretty quickly, 
+ * unfortunately.  A 1GHz machine overflows in 4 seconds.
+ */
+typedef unsigned int ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned long cc;
+     __asm__ __volatile__ ("rpcc %0" : "=r"(cc));
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+     __asm__ __volatile__("rd %%tick, %0" : "=r" (ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER)
+#  include <c_asm.h>
+typedef unsigned int ticks;
+
+static __inline ticks getticks(void)
+{
+     unsigned long cc;
+     cc = asm("rpcc %v0");
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+/* SGI/Irix */
+#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER)
+typedef struct timespec ticks;
+
+static inline ticks getticks(void)
+{
+     struct timespec t;
+     clock_gettime(CLOCK_SGI_CYCLE, &t);
+     return t;
+}
+
+static inline double elapsed(ticks t1, ticks t0)
+{
+     return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 +
+	  ((double)t1.tv_nsec - (double)t0.tv_nsec);
+}
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* Cray UNICOS _rtc() intrinsic function */
+#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER)
+#ifdef HAVE_INTRINSICS_H
+#  include <intrinsics.h>
+#endif
+
+typedef long long ticks;
+
+#define getticks _rtc
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* MIPS ZBus */
+#if HAVE_MIPS_ZBUS_TIMER
+#if defined(__mips__) && !defined(HAVE_TICK_COUNTER)
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+typedef uint64_t ticks;
+
+static inline ticks getticks(void)
+{
+  static uint64_t* addr = 0;
+
+  if (addr == 0)
+  {
+    uint32_t rq_addr = 0x10030000;
+    int fd;
+    int pgsize;
+
+    pgsize = getpagesize();
+    fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0);
+    if (fd < 0) {
+      perror("open");
+      return NULL;
+    }
+    addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr);
+    close(fd);
+    if (addr == (uint64_t *)-1) {
+      perror("mmap");
+      return NULL;
+    }
+  }
+
+  return *addr;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+#endif /* HAVE_MIPS_ZBUS_TIMER */
+
diff --git a/data2/Density.txt.REMOVED.git-id b/data2/Density.txt.REMOVED.git-id
new file mode 100644
index 0000000000000000000000000000000000000000..ca18fbb99d20e3aab423b998ca98bb4151619fc2
--- /dev/null
+++ b/data2/Density.txt.REMOVED.git-id
@@ -0,0 +1 @@
+3afe2c7e0c425a30cdd406cfb93f305b5a55dafd
\ No newline at end of file
diff --git a/data2/Mass.txt.REMOVED.git-id b/data2/Mass.txt.REMOVED.git-id
new file mode 100644
index 0000000000000000000000000000000000000000..707be23bb0529c1b7d55045b37959c7ab6363369
--- /dev/null
+++ b/data2/Mass.txt.REMOVED.git-id
@@ -0,0 +1 @@
+98c93e9342eca73ce75144010612d4e0d47e3650
\ No newline at end of file
diff --git a/data2/SmoothingLength.txt.REMOVED.git-id b/data2/SmoothingLength.txt.REMOVED.git-id
new file mode 100644
index 0000000000000000000000000000000000000000..f944fe4f647ff8d8421a07aa2c175ab5a7edb26f
--- /dev/null
+++ b/data2/SmoothingLength.txt.REMOVED.git-id
@@ -0,0 +1 @@
+d57fee6402e9e77800e477525643d943f172dcbc
\ No newline at end of file
diff --git a/lock.h b/lock.h
new file mode 100644
index 0000000000000000000000000000000000000000..4357ec0c9aa259f304267cb11aa02aaa316d4276
--- /dev/null
+++ b/lock.h
@@ -0,0 +1,29 @@
+
+/* Get the inlining right. */
+#ifndef INLINE
+# if __GNUC__ && !__GNUC_STDC_INLINE__
+#  define INLINE extern inline
+# else
+#  define INLINE inline
+# endif
+#endif
+    
+#ifdef PTHREAD_LOCK
+    #define lock_type pthread_spinlock_t
+    #define lock_init( l ) ( pthread_spin_init( l , PTHREAD_PROCESS_PRIVATE ) != 0 )
+    #define lock_destroy( l ) ( pthread_spin_destroy( l ) != 0 )
+    #define lock_lock( l ) ( pthread_spin_lock( l ) != 0 )
+    #define lock_trylock( l ) ( pthread_spin_lock( l ) != 0 )
+    #define lock_unlock( l ) ( pthread_spin_unlock( l ) != 0 )
+#else
+    #define lock_type volatile int
+    #define lock_init( l ) ( *l = 0 )
+    #define lock_destroy( l ) 0
+    INLINE int lock_lock ( volatile int *l ) {
+        while ( __sync_val_compare_and_swap( l , 0 , 1 ) != 0 )
+            while( *l );
+        return 0;
+        }
+    #define lock_trylock( l ) ( ( *(l) ) ? 1 : __sync_val_compare_and_swap( l , 0 , 1 ) )
+    #define lock_unlock( l ) ( *l = 0 )
+#endif
diff --git a/mkplots.m b/mkplots.m
new file mode 100644
index 0000000000000000000000000000000000000000..2118fe183e0ed66f337554e0d02bac633f88b07b
--- /dev/null
+++ b/mkplots.m
@@ -0,0 +1,100 @@
+
+%% Scaling and efficiency plots for the first test
+test = importdata( 'test_nosort.totals');
+ncores = size( test , 1 );
+
+clf
+subplot('position',[ 0.05 , 0.1 , 0.4 , 0.8 ]);
+plot( 1:ncores , test(1,10) ./ test(:,10) , '-k' , 'LineWidth' , 2 ); hold on;
+xlabel('nr. cores');
+plot( [1,ncores] , [1,ncores] , ':k' , 'LineWidth' , 1.4 );
+hold off;
+title('Speedup');
+axis([ 1 , ncores , 0 , ncores ]);
+
+subplot('position',[ 0.52 0.1 , 0.4 , 0.8 ]);
+plot( 1:ncores , test(1,10) ./ test(:,10) ./ (1:ncores)' , '-k' , 'LineWidth' , 2 ); hold on;
+plot( [1,ncores] , [1,1] , ':k' , 'LineWidth' , 1.4 );
+text(4*ncores/5,1,sprintf('%.2f',min(test(:,10))),'BackgroundColor',[1,1,1],'FontSize',12);
+xlabel('nr. cores');
+hold off;
+title('Efficiency');
+axis([ 1 , ncores , 0 , 1.2 ]);
+
+% Print this plot
+set( gcf , 'PaperSize' , 2.3*[ 8.5 4.5 ] );
+set( gcf , 'PaperPosition' , 2.3*[ 0.25 0.25 8 4 ] );
+print -depsc2 test.eps
+!epstopdf test.eps 
+
+
+%% Scaling and efficiency plots for the second test
+test2 = importdata( 'test2.totals');
+ncores = size( test2 , 1 );
+
+clf
+subplot('position',[ 0.05 , 0.1 , 0.4 , 0.8 ]);
+plot( 1:ncores , test2(1,10) ./ test2(:,10) , '-k' , 'LineWidth' , 2 ); hold on;
+xlabel('nr. cores');
+plot( [1,ncores] , [1,ncores] , ':k' , 'LineWidth' , 1.4 );
+hold off;
+title('Speedup');
+axis([ 1 , ncores , 0 , ncores ]);
+
+subplot('position',[ 0.52 0.1 , 0.4 , 0.8 ]);
+plot( 1:ncores , test2(1,10) ./ test2(:,10) ./ (1:ncores)' , '-k' , 'LineWidth' , 2 ); hold on;
+plot( [1,ncores] , [1,1] , ':k' , 'LineWidth' , 1.4 );
+text(4*ncores/5,1,sprintf('%.2f',min(test2(:,10))),'BackgroundColor',[1,1,1],'FontSize',12);
+xlabel('nr. cores');
+hold off;
+title('Efficiency');
+axis([ 1 , ncores , 0 , 1.2 ]);
+
+% Print this plot
+set( gcf , 'PaperSize' , 2.3*[ 8.5 4.5 ] );
+set( gcf , 'PaperPosition' , 2.3*[ 0.25 0.25 8 4 ] );
+print -depsc2 test2.eps
+!epstopdf test2.eps 
+
+
+%% Components of the first test
+test = importdata( 'test_nosort.totals');
+ncores = size( test , 1 );
+cols = [ 1 , 2 , 3 , 5 ];
+
+clf
+subplot('position',[ 0.1 , 0.1 , 0.8 , 0.8 ]);
+plot( 1:ncores , test(:,cols) , 'LineWidth' , 2 ); hold on;
+legend( 'sort' , 'self' , 'pair' , 'get task' , 'Location' , 'NorthWest' );
+xlabel('nr. cores');
+hold off;
+title('ms per task type');
+axis([ 1 , ncores , 0 , max(max(test(:,cols)))*1.1 ]);
+
+% Print this plot
+set( gcf , 'PaperSize' , 2.3*[ 4.5 4 ] );
+set( gcf , 'PaperPosition' , 2.3*[ 0.25 0.25 4 4 ] );
+print -depsc2 parts.eps
+!epstopdf parts.eps 
+
+
+%% Components of the second test
+test2 = importdata( 'test2.totals');
+ncores = size( test2 , 1 );
+cols = [ 1 , 2 , 3 , 5 ];
+
+clf
+subplot('position',[ 0.1 , 0.1 , 0.8 , 0.8 ]);
+plot( 1:ncores , test2(:,cols) , 'LineWidth' , 2 ); hold on;
+legend( 'sort' , 'self' , 'pair' , 'get task' , 'Location' , 'NorthWest' );
+xlabel('nr. cores');
+hold off;
+title('ms per task type');
+axis([ 1 , ncores , 0 , max(max(test2(:,cols)))*1.1 ]);
+
+% Print this plot
+set( gcf , 'PaperSize' , 2.3*[ 4.5 4 ] );
+set( gcf , 'PaperPosition' , 2.3*[ 0.25 0.25 4 4 ] );
+print -depsc2 parts2.eps
+!epstopdf parts2.eps 
+
diff --git a/runner.c b/runner.c
new file mode 100644
index 0000000000000000000000000000000000000000..f6dea73e23069010a0bd6567189c7a1a1058e165
--- /dev/null
+++ b/runner.c
@@ -0,0 +1,1544 @@
+
+
+/* Some standard headers. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <omp.h>
+
+/* Local headers. */
+#include "cycle.h"
+#include "lock.h"
+#include "space.h"
+#include "runner.h"
+
+/* Error macro. */
+#define error(s) { printf( "%s:%s:%i: %s\n" , __FILE__ , __FUNCTION__ , __LINE__ , s ); abort(); }
+
+/* Convert cell location to ID. */
+#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) )
+
+/* The timers. */
+ticks runner_timer[ runner_timer_count ];
+
+/* The counters. */
+int runner_counter[ runner_counter_count ];
+
+        
+
+const float runner_shift[13*3] = {
+     5.773502691896258e-01 ,  5.773502691896258e-01 ,  5.773502691896258e-01 ,
+     7.071067811865475e-01 ,  7.071067811865475e-01 ,  0.0                   ,
+     5.773502691896258e-01 ,  5.773502691896258e-01 , -5.773502691896258e-01 ,
+     7.071067811865475e-01 ,  0.0                   ,  7.071067811865475e-01 ,
+     1.0                   ,  0.0                   ,  0.0                   ,
+     7.071067811865475e-01 ,  0.0                   , -7.071067811865475e-01 ,
+     5.773502691896258e-01 , -5.773502691896258e-01 ,  5.773502691896258e-01 ,
+     7.071067811865475e-01 , -7.071067811865475e-01 ,  0.0                   ,
+     5.773502691896258e-01 , -5.773502691896258e-01 , -5.773502691896258e-01 ,
+     0.0                   ,  7.071067811865475e-01 ,  7.071067811865475e-01 ,
+     0.0                   ,  1.0                   ,  0.0                   ,
+     0.0                   ,  7.071067811865475e-01 , -7.071067811865475e-01 ,
+     0.0                   ,  0.0                   ,  1.0                   ,
+    };
+const char runner_flip[27] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 ,
+                               0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; 
+
+
+/**
+ * @brief Sort the tasks IDs according to their weight and constraints.
+ *
+ * @param q The #queue.
+ */
+ 
+void queue_sort ( struct queue *q ) {
+
+    struct {
+        short int lo, hi;
+        } qstack[20];
+    int qpos, i, j, k, lo, hi, imin, temp;
+    int pivot_weight, pivot_wait;
+    int *weight, *wait;
+    int *data = q->tid;
+    struct task *t;
+        
+    /* Allocate and pre-compute each task's weight. */
+    if ( ( weight = (int *)alloca( sizeof(int) * q->count ) ) == NULL ||
+         ( wait = (int *)alloca( sizeof(int) * q->count ) ) == NULL )
+        error( "Failed to allocate weight buffer." );
+    for ( k = 0 ; k < q->count ; k++ ) {
+        t = &q->tasks[ q->tid[k] ];
+        switch ( t->type ) {
+            case tid_self:
+                wait[k] = t->rank;
+                weight[k] = 0; // t->ci->count * t->ci->count;
+                break;
+            case tid_pair:
+                wait[k] = t->rank;
+                weight[k] = 0; // t->ci->count * t->cj->count;
+                break;
+            case tid_sub:
+                wait[k] = t->rank;
+                weight[k] = 0; // (t->cj == NULL) ? t->ci->count * t->ci->count : t->ci->count * t->cj->count;
+                break;
+            case tid_sort:
+                wait[k] = t->rank;
+                weight[k] = 0; // t->ci->count;
+                break;
+            }
+        }
+        
+    /* Sort tasks. */
+    qstack[0].lo = 0; qstack[0].hi = q->count - 1; qpos = 0;
+    while ( qpos >= 0 ) {
+        lo = qstack[qpos].lo; hi = qstack[qpos].hi;
+        qpos -= 1;
+        if ( hi - lo < 15 ) {
+            for ( i = lo ; i < hi ; i++ ) {
+                imin = i;
+                for ( j = i+1 ; j <= hi ; j++ )
+                    if ( ( wait[ j ] < wait[ imin ] ) ||
+                         ( wait[ j ] == wait[ imin ] && weight[ j ] > weight[ imin ] ) )
+                if ( imin != i ) {
+                    temp = data[imin]; data[imin] = data[i]; data[i] = temp;
+                    temp = wait[imin]; wait[imin] = wait[i]; wait[i] = temp;
+                    temp = weight[imin]; weight[imin] = weight[i]; weight[i] = temp;
+                    }
+                }
+            }
+        else {
+            pivot_weight = weight[ ( lo + hi ) / 2 ];
+            pivot_wait = wait[ ( lo + hi ) / 2 ];
+            i = lo; j = hi;
+            while ( i <= j ) {
+                while ( ( wait[ i ] < pivot_wait ) ||
+                        ( wait[ i ] == pivot_wait && weight[ i ] > pivot_weight ) )
+                    i++;
+                while ( ( wait[ j ] > pivot_wait ) ||
+                        ( wait[ j ] == pivot_wait && weight[ j ] < pivot_weight ) )
+                    j--;
+                if ( i <= j ) {
+                    if ( i < j ) {
+                        temp = data[i]; data[i] = data[j]; data[j] = temp;
+                        temp = wait[i]; wait[i] = wait[j]; wait[j] = temp;
+                        temp = weight[i]; weight[i] = weight[j]; weight[j] = temp;
+                        }
+                    i += 1; j -= 1;
+                    }
+                }
+            if ( j > ( lo + hi ) / 2 ) {
+                if ( lo < j ) {
+                    qpos += 1;
+                    qstack[qpos].lo = lo;
+                    qstack[qpos].hi = j;
+                    }
+                if ( i < hi ) {
+                    qpos += 1;
+                    qstack[qpos].lo = i;
+                    qstack[qpos].hi = hi;
+                    }
+                }
+            else {
+                if ( i < hi ) {
+                    qpos += 1;
+                    qstack[qpos].lo = i;
+                    qstack[qpos].hi = hi;
+                    }
+                if ( lo < j ) {
+                    qpos += 1;
+                    qstack[qpos].lo = lo;
+                    qstack[qpos].hi = j;
+                    }
+                }
+            }
+        }
+                
+    }
+    
+    
+/** 
+ * @brief Sort the tasks in topological order over all queues.
+ *
+ * @param r The #runner.
+ */
+ 
+void runner_ranktasks ( struct runner *r ) {
+
+    int i, j = 0, k, temp, left = 0, rank;
+    struct task *t;
+    struct space *s = r->s;
+    int *tid;
+
+    /* Run throught the tasks and get all the waits right. */
+    for ( k = 0 ; k < s->nr_tasks ; k++ ) {
+        for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ )
+            s->tasks[k].unlock_tasks[j]->wait += 1;
+        }
+        
+    /* Allocate and init the task-ID array. */
+    if ( ( tid = (int *)malloc( sizeof(int) * s->nr_tasks ) ) == NULL )
+        error( "Failed to allocate temporary tid array." );
+    for ( k = 0 ; k < s->nr_tasks ; k++ )
+        tid[k] = k;
+        
+    /* Main loop. */
+    for ( rank = 0 ; left < s->nr_tasks ; rank++ ) {
+        
+        /* Load the tids of tasks with no waits. */
+        for ( k = left ; k < s->nr_tasks ; k++ )
+            if ( s->tasks[ tid[k] ].wait == 0 ) {
+                temp = tid[j]; tid[j] = tid[k]; tid[k] = temp;
+                j += 1;
+                }
+
+        /* Traverse the task tree and add tasks with no weight. */
+        for ( i = left ; i < j ; i++ ) {
+            t = &s->tasks[ tid[i] ];
+            t->rank = rank;
+            s->tasks_ind[i] = t - s->tasks;
+            /* printf( "runner_ranktasks: task %i of type %s has rank %i.\n" , i , 
+                (t->type == tid_self) ? "self" : (t->type == tid_pair) ? "pair" : "sort" , rank ); */
+            for ( k = 0 ; k < t->nr_unlock_tasks ; k++ )
+                t->unlock_tasks[k]->wait -= 1;
+            }
+            
+        /* The new left (no, not tony). */
+        left = j;
+            
+        }
+        
+    /* Release the temporary array. */
+    free(tid);
+    
+    }
+    
+
+/**
+ * @brief Compute the interactions between a cell pair.
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
+ 
+void runner_dopair_naive ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) {
+
+    struct runner *r = rt->r;
+    int pid, pjd, k, count_i = ci->count, count_j = cj->count;
+    double shift[3] = { 0.0 , 0.0 , 0.0 };
+    struct part *pi, *pj, *parts_i = ci->parts, *parts_j = cj->parts;
+    double dx[3], pix[3], ri, ri2, r2;
+    TIMER_TIC
+    
+    /* Get the relative distance between the pairs, wrapping. */
+    for ( k = 0 ; k < 3 ; k++ ) {
+        if ( cj->loc[k] - ci->loc[k] < -r->s->dim[k]/2 )
+            shift[k] = r->s->dim[k];
+        else if ( cj->loc[k] - ci->loc[k] > r->s->dim[k]/2 )
+            shift[k] = -r->s->dim[k];
+        }
+        
+    /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
+        ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] ,
+        ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout);
+    tic = getticks(); */
+    
+    /* Loop over the parts in ci. */
+    for ( pid = 0 ; pid < count_i ; pid++ ) {
+    
+        /* Get a hold of the ith part in ci. */
+        pi = &parts_i[ pid ];
+        for ( k = 0 ; k < 3 ; k++ )
+            pix[k] = pi->x[k] - shift[k];
+        ri = pi->r;
+        ri2 = ri * ri;
+        
+        /* Loop over the parts in cj. */
+        for ( pjd = 0 ; pjd < count_j ; pjd++ ) {
+        
+            /* Get a pointer to the jth particle. */
+            pj = &parts_j[ pjd ];
+        
+            /* Compute the pairwise distance. */
+            r2 = 0.0;
+            for ( k = 0 ; k < 3 ; k++ ) {
+                dx[k] = pix[k] - pj->x[k];
+                r2 += dx[k]*dx[k];
+                }
+                
+            /* Hit or miss? */
+            if ( r2 < ri2 || r2 < pj->r*pj->r ) {
+            
+                iact( r2 , ri , pj->r , &pi->count , &pj->count , &pi->icount , &pj->icount );
+            
+                }
+        
+            } /* loop over the parts in cj. */
+    
+        } /* loop over the parts in ci. */
+        
+    #ifdef TIMER_VERBOSE
+        printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , ((double)TIMER_TOC(runner_timer_dopair)) / CPU_TPS * 1000 );
+    #else
+        TIMER_TOC(runner_timer_dopair);
+    #endif
+
+
+    }
+
+
+/**
+ * @brief Compute the interactions between a cell pair.
+ *
+ * @param r The #runner.
+ * @param ci The first #cell.
+ * @param cj The second #cell.
+ */
+ 
+void runner_dopair ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) {
+
+    struct runner *r = rt->r;
+    int pid, pjd, k, sid;
+    double rshift, shift[3] = { 0.0 , 0.0 , 0.0 };
+    struct cell *temp;
+    struct entry *sort_i, *sort_j;
+    struct part *pi, *pj, *parts_i, *parts_j;
+    double dx[3], pix[3], pjx[3], ri, ri2, rj, rj2, r2, di, dj;
+    double ri_max, rj_max, di_max, dj_min;
+    int count_i, count_j;
+    TIMER_TIC
+    
+    /* Get the relative distance between the pairs, wrapping. */
+    for ( k = 0 ; k < 3 ; k++ ) {
+        if ( cj->loc[k] - ci->loc[k] < -r->s->dim[k]/2 )
+            shift[k] = r->s->dim[k];
+        else if ( cj->loc[k] - ci->loc[k] > r->s->dim[k]/2 )
+            shift[k] = -r->s->dim[k];
+        }
+        
+    /* Get the sorting index. */
+    for ( sid = 0 , k = 0 ; k < 3 ; k++ )
+        sid = 3*sid + ( (cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1 );
+
+    /* Switch the cells around? */
+    if ( runner_flip[sid] ) {
+        temp = ci; ci = cj; cj = temp;
+        for ( k = 0 ; k < 3 ; k++ )
+            shift[k] = -shift[k];
+        }
+    sid = sortlistID[sid];
+    
+    /* Get the cutoff shift. */
+    for ( rshift = 0.0 , k = 0 ; k < 3 ; k++ )
+        rshift += shift[k]*runner_shift[ 3*sid + k ];
+        
+    /* printf( "runner_dopair: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
+        ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] ,
+        ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); */
+    /* for ( ri = 0 , k = 0 ; k < ci->count ; k++ )
+        ri += ci->parts[k].r;
+    for ( rj = 0 , k = 0 ; k < cj->count ; k++ )
+        rj += cj->parts[k].r;
+    printf( "runner_dopair: avg. radii %g/%g for h=%g at depth=%i.\n" , ri/ci->count , rj/cj->count , ci->h[0] , ci->depth ); fflush(stdout); */
+    
+    /* Pick-out the sorted lists. */
+    sort_i = &ci->sort[ sid*(ci->count + 1) ];
+    sort_j = &cj->sort[ sid*(cj->count + 1) ];
+    
+    /* Get some other useful values. */
+    ri_max = ci->r_max - rshift; rj_max = cj->r_max - rshift;
+    count_i = ci->count; count_j = cj->count;
+    parts_i = ci->parts; parts_j = cj->parts;
+    di_max = sort_i[count_i-1].d;
+    dj_min = sort_j[0].d;
+    
+    /* if ( ci->split && cj->split && sid == 4 )
+        printf( "boing!\n" ); */
+    
+    /* Loop over the parts in ci. */
+    for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d + ri_max > dj_min ; pid-- ) {
+    
+        /* Get a hold of the ith part in ci. */
+        pi = &parts_i[ sort_i[ pid ].i ];
+        ri = pi->r - rshift;
+        di = sort_i[pid].d + ri;
+        if ( di < dj_min )
+            continue;
+            
+        ri2 = pi->r * pi->r;
+        for ( k = 0 ; k < 3 ; k++ )
+            pix[k] = pi->x[k] - shift[k];
+        
+        /* Loop over the parts in cj. */
+        for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d < di ; pjd++ ) {
+        
+            /* Get a pointer to the jth particle. */
+            pj = &parts_j[ sort_j[pjd].i ];
+        
+            /* Compute the pairwise distance. */
+            r2 = 0.0;
+            for ( k = 0 ; k < 3 ; k++ ) {
+                dx[k] = pix[k] - pj->x[k];
+                r2 += dx[k]*dx[k];
+                }
+                
+            /* Hit or miss? */
+            if ( r2 < ri2 ) {
+            
+                iact( r2 , ri , pj->r , &pi->count , &pj->count , &pi->icount , &pj->icount );
+            
+                }
+        
+            } /* loop over the parts in cj. */
+    
+        } /* loop over the parts in ci. */
+        
+    /* printf( "runner_dopair: first half took %.3f ms...\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 );
+    tic = getticks(); */
+
+    /* Loop over the parts in cj. */
+    for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d - rj_max < di_max ; pjd++ ) {
+    
+        /* Get a hold of the jth part in cj. */
+        pj = &parts_j[ sort_j[ pjd ].i ];
+        rj = pj->r + rshift;
+        dj = sort_j[pjd].d - rj;
+        if ( dj > di_max )
+            continue;
+            
+        for ( k = 0 ; k < 3 ; k++ )
+            pjx[k] = pj->x[k] + shift[k];
+        rj2 = pj->r * pj->r;
+        
+        /* Loop over the parts in ci. */
+        for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d > dj ; pid-- ) {
+        
+            /* Get a pointer to the jth particle. */
+            pi = &parts_i[ sort_i[pid].i ];
+            
+            /* Compute the pairwise distance. */
+            r2 = 0.0;
+            for ( k = 0 ; k < 3 ; k++ ) {
+                dx[k] = pi->x[k] - pjx[k];
+                r2 += dx[k]*dx[k];
+                }
+                
+            /* Hit or miss? */
+            if ( r2 < rj2 && r2 > pi->r*pi->r ) {
+            
+                iact( r2 , 0 , rj , NULL , &pj->count , NULL , &pj->icount );
+            
+                }
+        
+            } /* loop over the parts in cj. */
+    
+        } /* loop over the parts in ci. */
+
+    #ifdef TIMER_VERBOSE
+        printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(runner_timer_dopair))) / CPU_TPS * 1000 );
+    #else
+        TIMER_TOC(runner_timer_dopair);
+    #endif
+
+    }
+
+
+/**
+ * @brief Compute the cell self-interaction.
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
+
+void runner_doself ( struct runner_thread *rt , struct cell *c ) {
+
+    int k, pid, pjd, count = c->count;
+    double pix[3], dx[3], ri, ri2, r2;
+    struct part *pi, *pj, *parts = c->parts;
+    TIMER_TIC
+    
+    if ( c->split )
+        error( "Split cell should not have self-interactions." );
+    
+    /* Loop over the particles in the cell. */
+    for ( pid = 0 ; pid < count ; pid++ ) {
+    
+        /* Get a pointer to the ith particle. */
+        pi = &parts[pid];
+    
+        /* Get the particle position and radius. */
+        for ( k = 0 ; k < 3 ; k++ )
+            pix[k] = pi->x[k];
+        ri = pi->r;
+        ri2 = ri * ri;
+            
+        /* Loop over the other particles .*/
+        for ( pjd = pid+1 ; pjd < count ; pjd++ ) {
+        
+            /* Get a pointer to the jth particle. */
+            pj = &parts[pjd];
+        
+            /* Compute the pairwise distance. */
+            r2 = 0.0;
+            for ( k = 0 ; k < 3 ; k++ ) {
+                dx[k] = pix[k] - pj->x[k];
+                r2 += dx[k]*dx[k];
+                }
+                
+            /* Hit or miss? */
+            if ( r2 < ri2 || r2 < pj->r*pj->r ) {
+            
+                iact( r2 , ri , pj->r , &pi->count , &pj->count , &pi->icount , &pj->icount );
+            
+                }
+        
+            } /* loop over all other particles. */
+    
+        } /* loop over all particles. */
+
+    #ifdef TIMER_VERBOSE
+        printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , rt->id , count , c->depth , ((double)TIMER_TOC(runner_timer_doself)) / CPU_TPS * 1000 );
+    #else
+        TIMER_TOC(runner_timer_doself);
+    #endif
+
+    }
+
+
+/**
+ * @brief Compute grouped sub-cell interactions
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
+
+void runner_dosub ( struct runner_thread *rt , struct cell *ci , struct cell *cj , int flags ) {
+
+    int j, k;
+
+    TIMER_TIC
+    
+    /* Different types of flags. */
+    switch ( flags ) {
+    
+        /* Regular sub-cell interactions of a single cell. */
+        case 0:
+            for ( j = 0 ; j < 7 ; j++ )
+                for ( k = j + 1 ; k < 8 ; k++ )
+                    runner_dopair( rt , ci->progeny[j] , ci->progeny[k] );
+            break;
+            
+        case 1: /* (  1 ,  1 ,  0 ) */
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[1] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[1] );
+            break;
+    
+        case 3: /* (  1 ,  0 ,  1 ) */
+            runner_dopair( rt , ci->progeny[5] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[5] , cj->progeny[2] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[2] );
+            break;
+                    
+        case 4: /* (  1 ,  0 ,  0 ) */
+            runner_dopair( rt , ci->progeny[4] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[4] , cj->progeny[1] );
+            runner_dopair( rt , ci->progeny[4] , cj->progeny[2] );
+            runner_dopair( rt , ci->progeny[4] , cj->progeny[3] );
+            runner_dopair( rt , ci->progeny[5] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[5] , cj->progeny[1] );
+            runner_dopair( rt , ci->progeny[5] , cj->progeny[2] );
+            runner_dopair( rt , ci->progeny[5] , cj->progeny[3] );
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[1] );
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[2] );
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[3] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[1] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[2] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[3] );
+            break;
+            
+        case 5: /* (  1 ,  0 , -1 ) */
+            runner_dopair( rt , ci->progeny[4] , cj->progeny[1] );
+            runner_dopair( rt , ci->progeny[4] , cj->progeny[3] );
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[1] );
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[3] );
+            break;
+                    
+        case 7: /* (  1 , -1 ,  0 ) */
+            runner_dopair( rt , ci->progeny[4] , cj->progeny[2] );
+            runner_dopair( rt , ci->progeny[4] , cj->progeny[3] );
+            runner_dopair( rt , ci->progeny[5] , cj->progeny[2] );
+            runner_dopair( rt , ci->progeny[5] , cj->progeny[3] );
+            break;
+                    
+        case 9: /* (  0 ,  1 ,  1 ) */
+            runner_dopair( rt , ci->progeny[3] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[3] , cj->progeny[4] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[4] );
+            break;
+                    
+        case 10: /* (  0 ,  1 ,  0 ) */
+            runner_dopair( rt , ci->progeny[2] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[2] , cj->progeny[1] );
+            runner_dopair( rt , ci->progeny[2] , cj->progeny[4] );
+            runner_dopair( rt , ci->progeny[2] , cj->progeny[5] );
+            runner_dopair( rt , ci->progeny[3] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[3] , cj->progeny[1] );
+            runner_dopair( rt , ci->progeny[3] , cj->progeny[4] );
+            runner_dopair( rt , ci->progeny[3] , cj->progeny[5] );
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[1] );
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[4] );
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[5] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[1] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[4] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[5] );
+            break;
+                    
+        case 11: /* (  0 ,  1 , -1 ) */
+            runner_dopair( rt , ci->progeny[2] , cj->progeny[1] );
+            runner_dopair( rt , ci->progeny[2] , cj->progeny[5] );
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[1] );
+            runner_dopair( rt , ci->progeny[6] , cj->progeny[5] );
+            break;
+                    
+        case 12: /* (  0 ,  0 ,  1 ) */
+            runner_dopair( rt , ci->progeny[1] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[1] , cj->progeny[2] );
+            runner_dopair( rt , ci->progeny[1] , cj->progeny[4] );
+            runner_dopair( rt , ci->progeny[1] , cj->progeny[6] );
+            runner_dopair( rt , ci->progeny[3] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[3] , cj->progeny[2] );
+            runner_dopair( rt , ci->progeny[3] , cj->progeny[4] );
+            runner_dopair( rt , ci->progeny[3] , cj->progeny[6] );
+            runner_dopair( rt , ci->progeny[5] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[5] , cj->progeny[2] );
+            runner_dopair( rt , ci->progeny[5] , cj->progeny[4] );
+            runner_dopair( rt , ci->progeny[5] , cj->progeny[6] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[0] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[2] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[4] );
+            runner_dopair( rt , ci->progeny[7] , cj->progeny[6] );
+            break;
+                
+        }
+    
+
+    #ifdef TIMER_VERBOSE
+        printf( "runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n" , rt->id , flags , ci->depth , ((double)TIMER_TOC(runner_timer_dosub)) / CPU_TPS * 1000 );
+    #else
+        TIMER_TOC(runner_timer_dosub);
+    #endif
+
+    }
+
+
+/**
+ * @brief Sort the entries in ascending order using QuickSort.
+ *
+ * @param sort The entries
+ * @param N The number of entries.
+ */
+ 
+void runner_dosort_ascending ( struct entry *sort , int N ) {
+
+    struct {
+        short int lo, hi;
+        } qstack[10];
+    int qpos, i, j, lo, hi, imin;
+    struct entry temp;
+    float pivot;
+        
+    /* Sort parts in cell_i in decreasing order with quicksort */
+    qstack[0].lo = 0; qstack[0].hi = N - 1; qpos = 0;
+    while ( qpos >= 0 ) {
+        lo = qstack[qpos].lo; hi = qstack[qpos].hi;
+        qpos -= 1;
+        if ( hi - lo < 15 ) {
+            for ( i = lo ; i < hi ; i++ ) {
+                imin = i;
+                for ( j = i+1 ; j <= hi ; j++ )
+                    if ( sort[j].d < sort[imin].d )
+                        imin = j;
+                if ( imin != i ) {
+                    temp = sort[imin]; sort[imin] = sort[i]; sort[i] = temp;
+                    }
+                }
+            }
+        else {
+            pivot = sort[ ( lo + hi ) / 2 ].d;
+            i = lo; j = hi;
+            while ( i <= j ) {
+                while ( sort[i].d < pivot ) i++;
+                while ( sort[j].d > pivot ) j--;
+                if ( i <= j ) {
+                    if ( i < j ) {
+                        temp = sort[i]; sort[i] = sort[j]; sort[j] = temp;
+                        }
+                    i += 1; j -= 1;
+                    }
+                }
+            if ( j > ( lo + hi ) / 2 ) {
+                if ( lo < j ) {
+                    qpos += 1;
+                    qstack[qpos].lo = lo;
+                    qstack[qpos].hi = j;
+                    }
+                if ( i < hi ) {
+                    qpos += 1;
+                    qstack[qpos].lo = i;
+                    qstack[qpos].hi = hi;
+                    }
+                }
+            else {
+                if ( i < hi ) {
+                    qpos += 1;
+                    qstack[qpos].lo = i;
+                    qstack[qpos].hi = hi;
+                    }
+                if ( lo < j ) {
+                    qpos += 1;
+                    qstack[qpos].lo = lo;
+                    qstack[qpos].hi = j;
+                    }
+                }
+            }
+        }
+                
+    }
+    
+    
+/**
+ * @brief inline helper fuction to merge two entry arrays (forward).
+ *
+ * @param one the first array
+ * @param none the length of the first array
+ * @param two the second array
+ * @param ntwo the length of the second array
+ * @param dest the destination array.
+ */
+ 
+inline void merge_forward ( struct entry *__restrict__ one , int none , struct entry *__restrict__ two , int ntwo , struct entry *__restrict__ dest ) {
+
+    int i = 0, j = 0, k = 0;
+    
+    while ( j < none && k < ntwo )
+        if ( one[j].d < two[k].d )
+            dest[i++] = one[j++];
+        else
+            dest[i++] = two[k++];
+    if ( j == none )
+        for ( ; k < ntwo ; k++ )
+            dest[i++] = two[k];
+    else
+        for ( ; j < none ; j++ )
+            dest[i++] = one[j];
+
+    }
+    
+
+/**
+ * @brief inline helper fuction to merge two entry arrays (forward).
+ *
+ * @param one the first array
+ * @param none the length of the first array
+ * @param two the second array
+ * @param ntwo the length of the second array
+ * @param dest the destination array.
+ */
+ 
+inline void merge_backward ( struct entry *__restrict__ one , int none , struct entry *__restrict__ two , int ntwo , struct entry *__restrict__ dest ) {
+
+    int i = none + ntwo - 1, j = none - 1, k = ntwo - 1;
+    
+    while ( j >= 0 && k >= 0 )
+        if ( one[j].d > two[k].d )
+            dest[i--] = one[j--];
+        else
+            dest[i--] = two[k--];
+    if ( j < 0 )
+        for ( ; k >= 0 ; k-- )
+            dest[i--] = two[k];
+    else
+        for ( ; j >= 0 ; j-- )
+            dest[i--] = one[j];
+
+    }
+    
+
+/**
+ * @brief Sort the particles in the given cell along all cardinal directions.
+ *
+ * @param r The #runner.
+ * @param c The #cell.
+ */
+ 
+void runner_dosort ( struct runner_thread *rt , struct cell *c , int flags ) {
+
+    struct entry *finger;
+    struct entry *fingers[8];
+    struct part *parts = c->parts;
+    int j, k, count = c->count;
+    int cone, ctwo;
+    int i, ind, off[8], inds[8], temp_i;
+    float shift[3];
+    float buff[8];
+    struct cell *temp_c;
+    TIMER_TIC
+    
+    /* Does this cell even need to be sorted? */
+    for ( temp_c = c ; temp_c != NULL && temp_c->nr_pairs == 0 ; temp_c = temp_c->parent );
+    if ( temp_c == NULL )
+        return;
+
+    /* start by allocating the entry arrays. */
+    if ( lock_lock( &c->lock ) != 0 )
+        error( "Failed to lock cell." );
+    if ( c->sort == NULL )
+        if ( ( c->sort = (struct entry *)malloc( sizeof(struct entry) * (c->count + 1) * 13 ) ) == NULL )
+            error( "Failed to allocate sort memory." );
+    if ( lock_unlock( &c->lock ) != 0 )
+        error( "Failed to unlock cell." );
+        
+    /* Does this cell have any progeny? */
+    if ( c->split ) {
+    
+        /* Loop over the 13 different sort arrays. */
+        for ( j = 0 ; j < 13 ; j++ ) {
+        
+            /* Has this sort array been flagged? */
+            if ( !( flags & (1 << j) ) )
+                continue;
+                
+            if ( 0 ) {
+            
+                /* Get a finger on the sorting array. */
+                finger = &c->sort[ j*(count + 1) ];
+                
+                /* Merge the two first sub-cells forward into this cell. */
+                cone = c->progeny[0]->count;
+                ctwo = c->progeny[1]->count;
+                merge_forward( &c->progeny[0]->sort[ j*(cone + 1) ] , cone ,
+                               &c->progeny[1]->sort[ j*(ctwo + 1) ] , ctwo ,
+                               finger );
+                               
+                /* Merge-in the remaining arrays, alternating forward and
+                   backward merges. */
+                for ( k = 2 ; k < 8 ; k++ ) {
+                    cone = cone + ctwo;
+                    ctwo = c->progeny[k]->count;
+                    if ( k & 1 )
+                        merge_forward( &finger[ count - cone ] , cone ,
+                                       &c->progeny[k]->sort[ j*(ctwo + 1) ] , ctwo ,
+                                       finger );
+                    else
+                        merge_backward( finger , cone ,
+                                        &c->progeny[k]->sort[ j*(ctwo + 1) ] , ctwo ,
+                                        &finger[ count - cone - ctwo ] );
+                    }
+                
+                }
+                
+            else {
+            
+                /* Init the particle index offsets. */
+                for ( off[0] = 0 , k = 1 ; k < 8 ; k++ )
+                    if ( c->progeny[k-1] != NULL )
+                        off[k] = off[k-1] + c->progeny[k-1]->count;
+                    else
+                        off[k] = off[k-1];
+
+                /* Init the entries and indices. */
+                for ( k = 0 ; k < 8 ; k++ ) {
+                    inds[k] = k;
+                    if ( c->progeny[k] != NULL && c->progeny[k]->count > 0 ) {
+                        fingers[k] = &c->progeny[k]->sort[ j*(c->progeny[k]->count + 1) ];
+                        buff[k] = fingers[k]->d;
+                        off[k] = off[k];
+                        }
+                    else
+                        buff[k] = FLT_MAX;
+                    }
+
+                /* Sort the buffer. */
+                for ( i = 0 ; i < 7 ; i++ )
+                    for ( k = i+1 ; k < 8 ; k++ )
+                        if ( buff[ inds[k] ] < buff[ inds[i] ] ) {
+                            temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i;
+                            }
+
+                /* For each entry in the new sort list. */
+                finger = &c->sort[ j*(count + 1) ];
+                for ( ind = 0 ; ind < count ; ind++ ) {
+
+                    /* Copy the minimum into the new sort array. */
+                    finger[ind].d = buff[inds[0]];
+                    finger[ind].i = fingers[inds[0]]->i + off[inds[0]];
+
+                    /* Update the buffer. */
+                    fingers[inds[0]] += 1;
+                    buff[inds[0]] = fingers[inds[0]]->d;
+
+                    /* Find the smallest entry. */
+                    for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) {
+                        temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i;
+                        }
+
+                    } /* Merge. */
+                    
+                }
+            
+            /* Add a sentinel. */
+            c->sort[ j*(c->count + 1) + c->count ].d = FLT_MAX;
+            c->sort[ j*(c->count + 1) + c->count ].i = 0;
+            
+            } /* loop over sort arrays. */
+    
+        } /* progeny? */
+        
+    /* Otherwise, just sort. */
+    else {
+    
+        /* Loop over the different cell axes. */
+        for ( j = 0 ; j < 13 ; j++ ) {
+        
+            /* Has this sort array been flagged? */
+            if ( !( flags & (1 << j) ) )
+                continue;
+        
+            /* Get the shift vector. */
+            shift[0] = runner_shift[ 3*j + 0 ];
+            shift[1] = runner_shift[ 3*j + 1 ];
+            shift[2] = runner_shift[ 3*j + 2 ];
+            
+            /* Fill the sort array. */
+            finger = &c->sort[ j*(count + 1) ];
+            for ( k = 0 ; k < count ; k++ ) {
+                finger[k].i = k;
+                finger[k].d = parts[k].x[0]*shift[0] + parts[k].x[1]*shift[1] + parts[k].x[2]*shift[2];
+                }
+                
+            /* Add the sentinel. */
+            finger[ c->count ].d = FLT_MAX;
+            finger[ c->count ].i = 0;
+                
+            /* Sort descending. */
+            runner_dosort_ascending( finger , c->count );
+        
+            }
+            
+        }
+        
+    /* Verify the sorting. */
+    /* for ( j = 0 ; j < 13 ; j++ ) {
+        if ( !( flags & (1 << j) ) )
+            continue;
+        finger = &c->sort[ j*(c->count + 1) ];
+        for ( k = 1 ; k < c->count ; k++ ) {
+            if ( finger[k].d < finger[k-1].d )
+                error( "Sorting failed, ascending array." );
+            if ( finger[k].i >= c->count )
+                error( "Sorting failed, indices borked." );
+            }
+        } */
+
+    #ifdef TIMER_VERBOSE
+        printf( "runner_dosort[%02i]: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms.\n" ,
+            rt->id , c->count , c->depth ,
+            (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , 
+            ((double)TIMER_TOC(runner_timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout);
+    #else
+        TIMER_TOC(runner_timer_dosort);
+    #endif
+
+    }
+
+
+/**
+ * @brief Lock a cell and hold its parents.
+ *
+ * @param c The #cell.
+ */
+ 
+int cell_locktree( struct cell *c ) {
+
+    struct cell *finger, *finger2;
+    TIMER_TIC
+
+    /* First of all, try to lock this cell. */
+    if ( lock_trylock( &c->lock ) != 0 ) {
+        TIMER_TOC(runner_timer_tree);
+        return 1;
+        }
+        
+    /* Did somebody hold this cell in the meantime? */
+    if ( c->hold ) {
+        
+        /* Unlock this cell. */
+        if ( lock_unlock( &c->lock ) != 0 )
+            error( "Failed to unlock cell." );
+            
+        /* Admit defeat. */
+        TIMER_TOC(runner_timer_tree);
+        return 1;
+    
+        }
+        
+    /* Climb up the tree and lock/hold/unlock. */
+    for ( finger = c->parent ; finger != NULL ; finger = finger->parent ) {
+    
+        /* Lock this cell. */
+        if ( lock_trylock( &finger->lock ) != 0 )
+            break;
+            
+        /* Increment the hold. */
+        __sync_fetch_and_add( &finger->hold , 1 );
+        
+        /* Unlock the cell. */
+        if ( lock_unlock( &finger->lock ) != 0 )
+            error( "Failed to unlock cell." );
+    
+        }
+        
+    /* If we reached the top of the tree, we're done. */
+    if ( finger == NULL ) {
+        TIMER_TOC(runner_timer_tree);
+        return 0;
+        }
+        
+    /* Otherwise, we hit a snag. */
+    else {
+    
+        /* Undo the holds up to finger. */
+        for ( finger2 = c->parent ; finger2 != finger ; finger2 = finger2->parent )
+            __sync_fetch_and_sub( &finger2->hold , 1 );
+            
+        /* Unlock this cell. */
+        if ( lock_unlock( &c->lock ) != 0 )
+            error( "Failed to unlock cell." );
+            
+        /* Admit defeat. */
+        TIMER_TOC(runner_timer_tree);
+        return 1;
+    
+        }
+
+    }
+    
+    
+/**
+ * @brief Unock a cell's parents.
+ *
+ * @param c The #cell.
+ */
+ 
+void cell_unlocktree( struct cell *c ) {
+
+    struct cell *finger;
+    TIMER_TIC
+
+    /* First of all, try to unlock this cell. */
+    if ( lock_unlock( &c->lock ) != 0 )
+        error( "Failed to unlock cell." );
+        
+    /* Climb up the tree and unhold the parents. */
+    for ( finger = c->parent ; finger != NULL ; finger = finger->parent )
+        __sync_fetch_and_sub( &finger->hold , 1 );
+        
+    TIMER_TOC(runner_timer_tree);
+        
+    }
+    
+    
+/**
+ * @brief Implements a barrier for the #runner threads.
+ *
+ */
+ 
+void runner_barrier( struct runner *r ) {
+
+    /* First, get the barrier mutex. */
+    if ( pthread_mutex_lock( &r->barrier_mutex ) != 0 )
+        error( "Failed to get barrier mutex." );
+        
+    /* Wait for the barrier to close. */
+    while ( r->barrier_count < 0 )
+        if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 )
+            error( "Eror waiting for barrier to close." );
+        
+    /* Once I'm in, increase the barrier count. */
+    r->barrier_count += 1;
+    
+    /* If all threads are in, send a signal... */
+    if ( r->barrier_count == r->nr_threads )
+        if ( pthread_cond_broadcast( &r->barrier_cond ) != 0 )
+            error( "Failed to broadcast barrier full condition." );
+        
+    /* Wait for barrier to be released. */
+    while ( r->barrier_count > 0 )
+        if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 )
+            error( "Error waiting for barrier to be released." );
+            
+    /* Decrease the counter before leaving... */
+    r->barrier_count += 1;
+    
+    /* If I'm the last one out, signal the condition again. */
+    if ( r->barrier_count == 0 )
+        if ( pthread_cond_broadcast( &r->barrier_cond ) != 0 )
+            error( "Failed to broadcast empty barrier condition." );
+            
+    /* Last but not least, release the mutex. */
+    if ( pthread_mutex_unlock( &r->barrier_mutex ) != 0 )
+        error( "Failed to get unlock the barrier mutex." );
+
+    }
+    
+    
+/**
+ * @brief The #runner main thread routine.
+ *
+ * @param data A pointer to this thread's data.
+ */
+ 
+void *runner_main ( void *data ) {
+
+    struct runner_thread *rt = (struct runner_thread *)data;
+    struct runner *r = rt->r;
+    struct space *s = r->s;
+    int threadID = rt->id;
+    int k, qid, naq, keep, tpq;
+    struct queue *queues[ r->nr_queues ], *myq;
+    struct task *t;
+    struct cell *ci, *cj;
+    #ifdef TIMER
+        ticks stalled;
+    #endif
+    
+    /* Main loop. */
+    while ( 1 ) {
+    
+        /* Wait at the barrier. */
+        runner_barrier( r );
+        
+        /* Set some convenient local data. */
+        keep = r->policy & runner_policy_keep;
+        myq = &r->queues[ threadID % r->nr_queues ];
+        tpq = ceil( ((double)r->nr_threads) / r->nr_queues );
+        stalled = 0;
+        
+        /* Set up the local list of active queues. */
+        naq = r->nr_queues;
+        for ( k = 0 ; k < naq ; k++ )
+            queues[k] = &r->queues[k];
+    
+        /* Set up the local list of active queues. */
+        naq = r->nr_queues;
+        for ( k = 0 ; k < naq ; k++ )
+            queues[k] = &r->queues[k];
+    
+        /* Loop while there are tasks... */
+        while ( 1 ) {
+        
+            /* Remove any inactive queues. */
+            for ( k = 0 ; k < naq ; k++ )
+                if ( queues[k]->next == queues[k]->count ) {
+                    naq -= 1;
+                    queues[k] = queues[naq];
+                    k -= 1;
+                    }
+            if ( naq == 0 )
+                break;
+        
+            /* Get a task, how and from where depends on the policy. */
+            TIMER_TIC
+            t = NULL;
+            if ( r->nr_queues == 1 ) {
+                t = queue_gettask( &r->queues[0] , 1 , 0 );
+                }
+            else if ( r->policy & runner_policy_steal ) {
+                if ( ( myq->next == myq->count ) ||
+                     ( t = queue_gettask( myq , 0 , 0 ) ) == NULL ) {
+                    TIMER_TIC2
+                    qid = rand() % naq;
+                    keep = ( r->policy & runner_policy_keep ) &&
+                           ( myq->count <= myq->size-tpq );
+                    if ( myq->next == myq->count )
+                        COUNT(runner_counter_steal_empty);
+                    else
+                        COUNT(runner_counter_steal_stall);
+                    t = queue_gettask( queues[qid] , 0 , keep );
+                    if ( t != NULL && keep ) {
+                        COUNT(runner_counter_keep);
+                        if ( lock_lock( &myq->lock ) != 0 )
+                            error( "Failed to get queue lock." );
+                        for ( k = myq->count ; k > myq->next ; k-- )
+                            myq->tid[k] = myq->tid[k-1];
+                        myq->tid[ myq->next ] = t - s->tasks;
+                        myq->count += 1;
+                        myq->next += 1;
+                        if ( lock_unlock( &myq->lock ) != 0 )
+                            error( "Failed to unlock queue." );
+                        }
+                    TIMER_TOC2(runner_timer_steal);
+                    }
+                }
+            else if ( r->policy & runner_policy_rand ) {
+                qid = rand() % naq;
+                t = queue_gettask( queues[qid] , r->policy & runner_policy_block , 0 );
+                }
+            else {
+                t = queue_gettask( &r->queues[threadID] , r->policy & runner_policy_block , 0 );
+                }
+            TIMER_TOC(runner_timer_getpair);
+            
+            /* Did I get anything? */
+            if ( t == NULL ) {
+                COUNT(runner_counter_stall);
+                if ( !stalled )
+                    stalled = getticks();
+                continue;
+                }
+            #ifdef TIMER
+            else if ( stalled ) {
+                stalled = getticks() - stalled;
+                __sync_add_and_fetch( &runner_timer[runner_timer_stalled] , stalled );
+                #ifdef TIMER_VERBOSE
+                    printf( "runner_main[%02i]: stalled %.3f ms\n" , rt->id , ((double)stalled) / CPU_TPS * 1000 );
+                    fflush(stdout);
+                #endif
+                stalled = 0;
+                }
+            #endif
+        
+            /* Get the cells. */
+            ci = t->ci;
+            cj = t->cj;
+            
+            /* Different types of tasks... */
+            switch ( t->type ) {
+                case tid_self:
+                    runner_doself( rt , ci );
+                    cell_unlocktree( ci );
+                    break;
+                case tid_pair:
+                    runner_dopair( rt , ci , cj );
+                    cell_unlocktree( ci );
+                    cell_unlocktree( cj );
+                    break;
+                case tid_sort:
+                    runner_dosort( rt , ci , t->flags );
+                    break;
+                case tid_sub:
+                    runner_dosub( rt , ci , cj , t->flags );
+                    cell_unlocktree( ci );
+                    if ( cj != NULL )
+                        cell_unlocktree( cj );
+                    break;
+                default:
+                    error( "Unknown task type." );
+                }
+                
+            t->done = 1;
+            
+            /* Resolve any dependencies. */
+            for ( k = 0 ; k < t->nr_unlock_tasks ; k++ )
+                __sync_fetch_and_sub( &t->unlock_tasks[k]->wait , 1 );
+            for ( k = 0 ; k < t->nr_unlock_cells ; k++ )
+                __sync_fetch_and_sub( &t->unlock_cells[k]->wait , 1 );
+        
+            } /* main loop. */
+            
+    	/* Any leftover stalls? */    
+        #ifdef TIMER
+        if ( stalled ) {
+            stalled = getticks() - stalled;
+            __sync_add_and_fetch( &runner_timer[runner_timer_stalled] , stalled );
+            #ifdef TIMER_VERBOSE
+                printf( "runner_main[%02i]: stalled %.3f ms\n" , rt->id , ((double)stalled) / CPU_TPS * 1000 );
+                fflush(stdout);
+            #endif
+            stalled = 0;
+            }
+        #endif
+            
+        }
+        
+    /* Be kind, rewind. */
+    return NULL;
+
+    }
+    
+
+/**
+ * @brief Let the #runner loose on the given #space.
+ *
+ * @param r The #runner.
+ * @param s The #space.
+ */
+ 
+void runner_run ( struct runner *r , int sort_queues ) {
+
+    int j, k;
+    struct space *s = r->s;
+    
+    /* Run throught the tasks and get all the waits right. */
+    for ( k = 0 ; k < s->nr_tasks ; k++ ) {
+        s->tasks[k].done = 0;
+        for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ )
+            s->tasks[k].unlock_tasks[j]->wait += 1;
+        for ( j = 0 ; j < s->tasks[k].nr_unlock_cells ; j++ )
+            s->tasks[k].unlock_cells[j]->wait += 1;
+        }
+    
+    /* Re-set the queues.*/
+    if ( sort_queues ) {
+        #pragma omp parallel for default(none), shared(r)
+        for ( k = 0 ; k < r->nr_queues ; k++ ) {
+            queue_sort( &r->queues[k] );
+            r->queues[k].next = 0;
+            }
+        }
+    else
+        for ( k = 0 ; k < r->nr_queues ; k++ )
+            r->queues[k].next = 0;
+    
+    /* Cry havoc and let loose the dogs of war. */
+    r->barrier_count = -r->barrier_count;
+    if ( pthread_cond_broadcast( &r->barrier_cond ) != 0 )
+        error( "Failed to broadcast barrier open condition." );
+        
+    /* Sit back and wait for the runner_threads to come home. */
+    while ( r->barrier_count < r->nr_threads )
+        if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 )
+            error( "Error while waiting for barrier." );
+    
+    }
+    
+    
+/**
+ * @brief Get a task free of dependencies and conflicts.
+ *
+ * @param q The task #queue.
+ * @param blocking Block until access to the queue is granted.
+ * @param keep Remove the returned task from this queue.
+ */
+ 
+struct task *queue_gettask ( struct queue *q , int blocking , int keep ) {
+
+    int k, tid = -1, qcount, *qtid = q->tid;
+    lock_type *qlock = &q->lock;
+    struct task *qtasks = q->tasks, *res = NULL;
+    TIMER_TIC
+    
+    /* If there are no tasks, leave immediately. */
+    if ( q->next >= q->count ) {
+        TIMER_TOC(runner_timer_queue);
+        return NULL;
+        }
+
+    /* Main loop, while there are tasks... */
+    while ( q->next < q->count ) {
+    
+        /* Grab the task lock. */
+        // if ( blocking ) {
+            if ( lock_lock( qlock ) != 0 )
+                error( "Locking the task_lock failed.\n" );
+        //     }
+        // else {
+        //     if ( lock_trylock( qlock ) != 0 )
+        //         break;
+        //     }
+            
+        /* Loop over the remaining task IDs. */
+        qcount = q->count;
+        for ( k = q->next ; k < qcount ; k++ ) {
+        
+            /* Put a finger on the task. */
+            res = &qtasks[ qtid[k] ];
+            
+            /* Is this task blocked? */
+            if ( res->wait )
+                continue;
+            
+            /* Different criteria for different types. */
+            if ( res->type == tid_self || (res->type == tid_sub && res->cj == NULL) ) {
+                if ( res->ci->hold || cell_locktree( res->ci ) != 0 )
+                    continue;
+                }
+            else if ( res->type == tid_pair || (res->type == tid_sub && res->cj != NULL) ) {
+                if ( res->ci->hold || res->cj->hold || res->ci->wait || res->cj->wait )
+                    continue;
+                if ( cell_locktree( res->ci ) != 0 )
+                    continue;
+                if ( cell_locktree( res->cj ) != 0 ) {
+                    cell_unlocktree( res->ci );
+                    continue;
+                    }
+                }
+            
+            /* If we made it this far, we're safe. */
+            break;
+        
+            } /* loop over the task IDs. */
+            
+        /* Did we get a task? */
+        if ( k < qcount ) {
+        
+            /* Do we need to swap? */
+            if ( k != q->next )
+                COUNT(runner_counter_swap);
+        
+            /* get the task ID. */
+            tid = qtid[k];
+        
+            /* Remove the task? */
+            if ( keep ) {
+            
+                /* Bubble-up. */
+                q->count = qcount - 1;
+                for ( ; k < qcount ; k++)
+                    qtid[k] = qtid[k+1];
+            
+                }
+                
+            /* No, leave it in the queue. */
+            else {
+            
+                TIMER_TIC2
+
+                /* Bubble-down the task. */
+                while ( k > q->next ) {
+                    qtid[ k ] = qtid[ k-1 ];
+                    k -= 1;
+                    }
+                qtid[ q->next ] = tid;
+                
+                /* up the counter. */
+                q->next += 1;
+                
+                TIMER_TOC2(runner_timer_bubble);
+            
+                }
+            
+            }
+    
+        /* Release the task lock. */
+        if ( lock_unlock( qlock ) != 0 )
+            error( "Unlocking the task_lock failed.\n" );
+            
+        /* Leave? */
+        if ( tid >= 0 ) {
+            TIMER_TOC(runner_timer_queue);
+            return &qtasks[tid];
+            }
+        else if ( !blocking )
+            break;
+    
+        } /* while there are tasks. */
+        
+    /* No beef. */
+    TIMER_TOC(runner_timer_queue);
+    return NULL;
+
+    }
+
+
+/**
+ * @brief init a runner with the given number of threads, queues, and
+ *      the given policy.
+ *
+ * @param r The #runner.
+ * @param s The #space in which this #runner will run.
+ * @param nr_threads The number of threads to spawn.
+ * @param nr_queues The number of task queues to create.
+ * @param policy The queueing policy to use.
+ */
+ 
+void runner_init ( struct runner *r , struct space *s , int nr_threads , int nr_queues , int policy ) {
+
+    int k, qid, nrq;
+    
+    /* Store the values. */
+    r->s = s;
+    r->nr_threads = nr_threads;
+    r->nr_queues = nr_queues;
+    r->policy = policy;
+    
+    /* First of all, init the barrier and lock it. */
+    if ( pthread_mutex_init( &r->barrier_mutex , NULL ) != 0 )
+        error( "Failed to initialize barrier mutex." );
+    if ( pthread_cond_init( &r->barrier_cond , NULL ) != 0 )
+        error( "Failed to initialize barrier condition variable." );
+    if ( pthread_mutex_lock( &r->barrier_mutex ) != 0 )
+        error( "Failed to lock barrier mutex." );
+    r->barrier_count = 0;
+    
+    /* Allocate the queues. */
+    if ( posix_memalign( (void *)(&r->queues) , 64 , nr_queues * sizeof(struct queue) ) != 0 )
+        error( "Failed to allocate queues." );
+    bzero( r->queues , nr_queues * sizeof(struct queue) );
+        
+    /* Init the queues. */
+    for ( k = 0 ; k < nr_queues ; k++ ) {
+        r->queues[k].size = s->nr_tasks;
+        if ( ( r->queues[k].tid = (int *)malloc( sizeof(int) * r->queues[k].size ) ) == NULL )
+            error( "Failed to allocate queue tids." );
+        r->queues[k].count = 0;
+        r->queues[k].next = 0;
+        if ( lock_init( &r->queues[k].lock ) != 0 )
+            error( "Failed to init queue lock." );
+        r->queues[k].tasks = s->tasks;
+        r->queues[k].r = r;
+        }
+        
+    /* Rank the tasks in topological order. */
+    runner_ranktasks( r );
+    
+    /* How many queues to fill initially? */
+    for ( nrq = 0 , k = nr_queues ; k > 0 ; k = k / 2 )
+        nrq += 1;
+        
+    /* Fill the queues (round-robin). */
+    for ( k = 0 ; k < s->nr_tasks ; k++ ) {
+        if ( s->tasks[ s->tasks_ind[k] ].type == tid_none)
+            continue;
+        // qid = 0;
+        // qid = k % nrq;
+        qid = k % nr_queues;
+        r->queues[qid].tid[ r->queues[qid].count ] = s->tasks_ind[k];
+        r->queues[qid].count += 1;
+        }
+        
+    /* Sort the queues topologically. */
+    for ( k = 0 ; k < nr_queues ; k++ )
+        queue_sort( &r->queues[k] );
+        
+    /* Allocate and init the threads. */
+    if ( ( r->threads = (struct runner_thread *)malloc( sizeof(struct runner_thread) * nr_threads ) ) == NULL )
+        error( "Failed to allocate threads array." );
+    for ( k = 0 ; k < nr_threads ; k++ ) {
+        r->threads[k].id = k;
+        r->threads[k].r = r;
+        if ( pthread_create( &r->threads[k].thread , NULL , &runner_main , &r->threads[k] ) != 0 )
+            error( "Failed to create runner thread." );
+        }
+        
+    /* Wait for the runner threads to be in place. */
+    while ( r->barrier_count != r->nr_threads )
+        if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 )
+            error( "Error while waiting for runner threads to get in place." );
+    
+    }
+    
+    
+    
diff --git a/runner.h b/runner.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f233ada759ee8bcb472a26aee7db6eb18fadb65
--- /dev/null
+++ b/runner.h
@@ -0,0 +1,212 @@
+
+/* Some constants. */
+#define runner_policy_none          0
+#define runner_policy_rand          1
+#define runner_policy_steal         2
+#define runner_policy_keep          4
+#define runner_policy_block         8
+
+#define runner_queue_scale          1.2
+
+
+/* The timers themselves. */
+enum {
+    runner_timer_none = 0,
+    runner_timer_dosort,
+    runner_timer_doself,
+    runner_timer_dopair,
+    runner_timer_dosub,
+    runner_timer_getpair,
+    runner_timer_queue,
+    runner_timer_tree,
+    runner_timer_bubble,
+    runner_timer_steal,
+    runner_timer_stalled,
+    runner_timer_count,
+    };
+extern ticks runner_timer[ runner_timer_count ];
+
+
+/* Define the timer macros. */
+#ifdef TIMER_VERBOSE
+    #define TIMER
+#endif
+#ifdef TIMER
+    #define TIMER_TIC ticks tic = getticks();
+    #define TIMER_TOC(t) timer_toc( t , tic )
+    #define TIMER_TIC2 ticks tic2 = getticks();
+    #define TIMER_TOC2(t) timer_toc( t , tic2 )
+    #ifndef INLINE
+    # if __GNUC__ && !__GNUC_STDC_INLINE__
+    #  define INLINE extern inline
+    # else
+    #  define INLINE inline
+    # endif
+    #endif
+    INLINE ticks timer_toc ( int t , ticks tic ) {
+        ticks d = (getticks() - tic);
+        __sync_add_and_fetch( &runner_timer[t] , d );
+        return d;
+        }
+#else
+    #define TIMER_TIC
+    #define TIMER_TOC(t)
+#endif
+
+
+/* Counters. */
+enum {
+    runner_counter_swap = 0,
+    runner_counter_stall,
+    runner_counter_steal_stall,
+    runner_counter_steal_empty,
+    runner_counter_keep,
+    runner_counter_count,
+    };
+extern int runner_counter[ runner_counter_count ];
+
+
+/* Counter macros. */
+#ifdef COUNTER
+    #define COUNT(c) ( __sync_add_and_fetch( &runner_counter[ c ] , 1 ) )
+#else
+    #define COUNT(c)
+#endif
+
+/* Get the inlining right. */
+#ifndef INLINE
+# if __GNUC__ && !__GNUC_STDC_INLINE__
+#  define INLINE extern inline
+# else
+#  define INLINE inline
+# endif
+#endif
+
+
+/**
+ * @brief Compute the 'interaction' between two particles.
+ *
+ * @param r2 the inter-particle radius squared.
+ * @param hi the screening distance of the ith particle.
+ * @param hj the screening distance of the jth particle.
+ * @param io Pointer to where to store the interaction of the ith particle.
+ * @param jo Pointer to where to store the interaction of the ith particle.
+ */
+ 
+__attribute__ ((always_inline)) INLINE void iact ( float r2 , float hi , float hj , float *io , float *jo , int *ic , int *jc ) {
+
+    #define  KERNEL_COEFF_1  2.546479089470f
+    #define  KERNEL_COEFF_2  15.278874536822f
+    #define  KERNEL_COEFF_3  45.836623610466f
+    #define  KERNEL_COEFF_4  30.557749073644f
+    #define  KERNEL_COEFF_5  5.092958178941f
+    #define  KERNEL_COEFF_6  (-15.278874536822f)
+    #define  NORM_COEFF      4.188790204786f
+
+    float r = sqrtf( r2 );
+    float ui, uj, wi, wj;
+    
+    if ( r2 < hi*hi ) {
+        
+        ui = r / hi;
+        if ( ui < 0.5 )
+            wi = KERNEL_COEFF_1 + KERNEL_COEFF_2 * (ui - 1.0f) * ui * ui;
+        else
+            wi = KERNEL_COEFF_5 * (1.0f - ui) * (1.0f - ui) * (1.0 - ui);
+        if ( io != NULL )
+            *io += NORM_COEFF * wi;
+        if ( ic != NULL )
+            *ic += 1;
+        
+        }
+
+    if ( r2 < hj*hj ) {
+        
+        uj = r / hj;
+        if ( uj < 0.5 )
+            wj = KERNEL_COEFF_1 + KERNEL_COEFF_2 * (uj - 1.0f) * uj * uj;
+        else
+            wj = KERNEL_COEFF_5 * (1.0f - uj) * (1.0f - uj) * (1.0 - uj);
+        if ( jo != NULL )
+            *jo += NORM_COEFF * wj;
+        if ( jc != NULL )
+            *jc += 1;
+            
+        }
+    
+    }
+    
+
+
+/* A task queue. */
+struct queue {
+
+    /* The lock to access this queue. */
+    lock_type lock;
+
+    /* Size, count and next element. */
+    int size, count, next;
+    
+    /* The runner in which this queue lives. */
+    struct runner *r;
+    
+    /* The actual tasks to which the indices refer. */
+    struct task *tasks;
+    
+    /* The task indices. */
+    int *tid;
+
+    } __attribute__((aligned (64)));
+    
+
+/* A struct representing a runner's thread and its data. */
+struct runner_thread {
+
+    /* The id of this thread. */
+    int id;
+
+    /* The thread which it is running. */
+    pthread_t thread;
+    
+    /* The underlying runner. */
+    struct runner *r;
+    
+    };
+
+
+/* Data structure for the runner. */
+struct runner {
+
+    /* Number of threads on which to run. */
+    int nr_threads;
+    
+    /* The space with which the runner is associated. */
+    struct space *s;
+    
+    /* The runner's threads. */
+    struct runner_thread *threads;
+    
+    /* The running policy. */
+    int policy;
+    
+    /* The number of queues. */
+    int nr_queues;
+    
+    /* The queues. */
+    struct queue *queues;
+    
+    /* Data for the threads' barrier. */
+    pthread_mutex_t barrier_mutex;
+    pthread_cond_t barrier_cond;
+    int barrier_count;
+    
+    };
+
+
+/* Function prototypes. */
+void runner_run ( struct runner *r , int sort_queues );
+void runner_dopair ( struct runner_thread *rt , struct cell *ci , struct cell *cj );
+void runner_doself ( struct runner_thread *rt , struct cell *c );
+void runner_dosort ( struct runner_thread *rt , struct cell *c , int flag );
+void runner_init ( struct runner *r , struct space *s , int nr_threads , int nr_queues , int policy );
+struct task *queue_gettask ( struct queue *q , int blocking , int keep );
diff --git a/snap_C09/extract.m b/snap_C09/extract.m
new file mode 100644
index 0000000000000000000000000000000000000000..c167dfef1eb072d9b3911c15260e8f24043fd7dd
--- /dev/null
+++ b/snap_C09/extract.m
@@ -0,0 +1,35 @@
+
+% Remove any old data
+!rm -f *.txt
+
+% Loop over the input files
+range = [ Inf , -Inf ];
+avg = [ 0 , 0 , 0 ];
+count = 0;
+shift = [ 4.331456e+01 , 4.097030e+01 , 4.383432e+01 ];
+for i=0:15
+
+    % Get the file name
+    fname = sprintf( 'snap_C09_200_000.%i.hdf5' , i );
+
+    % Get the coordinates
+    coord = double( h5read( fname , '/PartType0/Coordinates' )' );
+    coord = coord - repmat( shift , size( coord , 1 ) , 1 );
+    save Coordinates.txt -ascii -double -append coord
+    
+    % Adjust the range
+    range(1) = min( range(1) , min(min(coord)) );
+    range(2) = max( range(2) , max(max(coord)) );
+    avg = avg + sum( coord , 1 );
+    count = count + size( coord , 1 );
+    
+    % Get the smoothing lengths
+    h = double( h5read( fname , '/PartType0/SmoothingLength' ) );
+    save SmoothingLength.txt -ascii -double -append h
+    
+end
+
+% Display some statistics
+disp( sprintf( 'read %i particles' , count ) );
+disp( sprintf( 'range of coords is [ %e , %e ]' , range(1) , range(2) ) );
+disp( sprintf( 'avg position is [ %e , %e , %e ]' , avg(1)/count , avg(2)/count , avg(3)/count ) );
diff --git a/space.c b/space.c
new file mode 100644
index 0000000000000000000000000000000000000000..afed2e76a79d170e3d6cb1a029260017c8535bc1
--- /dev/null
+++ b/space.c
@@ -0,0 +1,1313 @@
+
+
+/* Some standard headers. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+/* Local headers. */
+#include "cycle.h"
+#include "lock.h"
+#include "space.h"
+#include "runner.h"
+
+/* Error macro. */
+#define error(s) { printf( "%s:%s:%i: %s\n" , __FILE__ , __FUNCTION__ , __LINE__ , s ); abort(); }
+
+/* Convert cell location to ID. */
+#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) )
+
+/* Task type names. */
+const char *taskID_names[tid_count] = { "none" , "sort" , "self" , "pair" , "sub" };
+
+/* Map shift vector to sortlist. */
+const int sortlistID[27] = {
+    /* ( -1 , -1 , -1 ) */   0 ,
+    /* ( -1 , -1 ,  0 ) */   1 , 
+    /* ( -1 , -1 ,  1 ) */   2 ,
+    /* ( -1 ,  0 , -1 ) */   3 ,
+    /* ( -1 ,  0 ,  0 ) */   4 , 
+    /* ( -1 ,  0 ,  1 ) */   5 ,
+    /* ( -1 ,  1 , -1 ) */   6 ,
+    /* ( -1 ,  1 ,  0 ) */   7 , 
+    /* ( -1 ,  1 ,  1 ) */   8 ,
+    /* (  0 , -1 , -1 ) */   9 ,
+    /* (  0 , -1 ,  0 ) */   10 , 
+    /* (  0 , -1 ,  1 ) */   11 ,
+    /* (  0 ,  0 , -1 ) */   12 ,
+    /* (  0 ,  0 ,  0 ) */   0 , 
+    /* (  0 ,  0 ,  1 ) */   12 ,
+    /* (  0 ,  1 , -1 ) */   11 ,
+    /* (  0 ,  1 ,  0 ) */   10 , 
+    /* (  0 ,  1 ,  1 ) */   9 ,
+    /* (  1 , -1 , -1 ) */   8 ,
+    /* (  1 , -1 ,  0 ) */   7 , 
+    /* (  1 , -1 ,  1 ) */   6 ,
+    /* (  1 ,  0 , -1 ) */   5 ,
+    /* (  1 ,  0 ,  0 ) */   4 , 
+    /* (  1 ,  0 ,  1 ) */   3 ,
+    /* (  1 ,  1 , -1 ) */   2 ,
+    /* (  1 ,  1 ,  0 ) */   1 , 
+    /* (  1 ,  1 ,  1 ) */   0 
+    };
+    
+    
+/**
+ * @brief Mapping function to draw a specific cell (gnuplot).
+ */
+
+void space_map_clearsort ( struct cell *c , void *data ) {
+
+    if ( c->sort != NULL ) {
+        free( c->sort );
+        c->sort = NULL;
+        }
+
+    }
+
+
+/**
+ * @brief Get a task free of dependencies and conflicts.
+ *
+ * @param s The #space.
+ */
+ 
+struct task *space_gettask ( struct space *s ) {
+
+    int k, tid = -1;
+    struct task *res = NULL;
+    struct cell *c;
+
+    /* Main loop, while there are tasks... */
+    while ( s->next_task < s->nr_tasks ) {
+    
+        /* Grab the task lock. */
+        if ( lock_lock( &s->task_lock ) != 0 )
+            error( "Locking the task_lock failed.\n" );
+            
+        /* Loop over the remaining task IDs. */
+        for ( k = s->next_task ; k < s->nr_tasks ; k++ ) {
+        
+            /* Put a finger on the task. */
+            res = &s->tasks[ s->tasks_ind[k] ];
+            
+            /* Is this task blocked? */
+            if ( res->wait )
+                continue;
+            
+            /* Different criteria for different types. */
+            switch ( res->type ) {
+                case tid_self:
+                    if ( res->ci->lock || res->ci->wait > 0 )
+                        continue;
+                    break;
+                case tid_pair:
+                    if ( res->ci->lock || res->cj->lock || res->ci->wait || res->cj->wait )
+                        continue;
+                    break;
+                case tid_sort:
+                    if ( res->ci->lock )
+                        continue;
+                    break;
+                }
+            
+            /* If we made it this far, we're safe. */
+            break;
+        
+            } /* loop over the task IDs. */
+            
+        /* Did we get a task? */
+        if ( k < s->nr_tasks ) {
+        
+            // /* Swap to front. */
+            // tid = s->tasks_ind[k];
+            // s->tasks_ind[k] = s->tasks_ind[ s->next_task ];
+            // s->tasks_ind[ s->next_task ] = tid;
+            
+            /* Bubble-down the task. */
+            tid = s->tasks_ind[k];
+            while ( k > s->next_task ) {
+                s->tasks_ind[ k ] = s->tasks_ind[ k-1 ];
+                k -= 1;
+                }
+            s->tasks_ind[ s->next_task ] = tid;
+            
+            /* Lock the cells, if needed. */
+            if ( s->tasks[tid].type != tid_sort ) {
+                for ( c = res->ci ; c != NULL ; c = c->parent )
+                    __sync_fetch_and_add( &c->lock , 1 );
+                for ( c = res->cj ; c != NULL ; c = c->parent )
+                    __sync_fetch_and_add( &c->lock , 1 );
+                }
+            
+            /* up the counter. */
+            s->next_task += 1;
+        
+            }
+    
+        /* Release the task lock. */
+        if ( lock_unlock( &s->task_lock ) != 0 )
+            error( "Locking the task_lock failed.\n" );
+            
+        /* Leave? */
+        if ( tid >= 0 )
+            return &s->tasks[tid];
+    
+        } /* while there are tasks. */
+        
+    /* No beef. */
+    return NULL;
+
+    }
+
+
+/**
+ * @brief Map a function to all particles in a aspace.
+ *
+ * @param s The #space we are working in.
+ */
+ 
+void space_map_parts ( struct space *s , void (*fun)( struct part *p , struct cell *c , void *data ) , void *data ) {
+
+    int i;
+
+    void rec_map ( struct cell *c ) {
+    
+        int k;
+        
+        /* No progeny? */
+        if ( !c->split )
+            for ( k = 0 ; k < c->count ; k++ )
+                fun( &c->parts[k] , c , data );
+                
+        /* Otherwise, recurse. */
+        else
+            for ( k = 0 ; k < 8 ; k++ )
+                if ( c->progeny[k] != NULL )
+                    rec_map( c->progeny[k] );
+                
+        }
+        
+    /* Call the recursive function on all higher-level cells. */
+    for ( i = 0 ; i < s->nr_cells ; i++ )
+        rec_map( &s->cells[i] );
+
+    }
+
+
+/**
+ * @brief Map a function to all particles in a aspace.
+ *
+ * @param s The #space we are working in.
+ */
+ 
+void space_map_cells ( struct space *s , void (*fun)( struct cell *c , void *data ) , void *data ) {
+
+    int i;
+
+    void rec_map ( struct cell *c ) {
+    
+        int k;
+        
+        /* No progeny? */
+        if ( !c->split )
+            fun( c , data );
+                
+        /* Otherwise, recurse. */
+        else
+            for ( k = 0 ; k < 8 ; k++ )
+                if ( c->progeny[k] != NULL )
+                    rec_map( c->progeny[k] );
+                
+        }
+        
+    /* Call the recursive function on all higher-level cells. */
+    for ( i = 0 ; i < s->nr_cells ; i++ )
+        rec_map( &s->cells[i] );
+
+    }
+
+
+/**
+ * @brief Add a #task to the #space.
+ *
+ * @param s The #space we are working in.
+ */
+ 
+struct task *space_addtask ( struct space *s , int type , int flags , int wait , struct cell *ci , struct cell *cj , struct task *unlock_tasks[] , int nr_unlock_tasks , struct cell *unlock_cells[] , int nr_unlock_cells ) {
+
+    struct task *t = &s->tasks[ s->nr_tasks ];
+    
+    /* Copy the data. */
+    t->type = type;
+    t->flags = flags;
+    t->wait = wait;
+    t->ci = ci;
+    t->cj = cj;
+    if ( unlock_tasks != NULL )
+        memcpy( t->unlock_tasks , unlock_tasks , sizeof(struct task *) * nr_unlock_tasks );
+    t->nr_unlock_tasks = nr_unlock_tasks;
+    if ( unlock_cells != NULL )
+        memcpy( t->unlock_cells , unlock_cells , sizeof(struct task *) * nr_unlock_cells );
+    t->nr_unlock_cells = nr_unlock_cells;
+    
+    /* Increase the task counter. */
+    s->nr_tasks += 1;
+    
+    /* Return a pointer to the new task. */
+    return t;
+
+    }
+
+
+
+/**
+ * @brief Remove an unlock_task from the given task.
+ *
+ * @param ta The unlocking #task.
+ * @param tb The #task that will be unlocked.
+ */
+ 
+void task_rmunlock( struct task *ta , struct task *tb ) {
+
+    int k;
+    
+    for ( k = 0 ; k < ta->nr_unlock_tasks ; k++ )
+        if ( ta->unlock_tasks[k] == tb ) {
+            ta->nr_unlock_tasks -= 1;
+            ta->unlock_tasks[k] = ta->unlock_tasks[ ta->nr_unlock_tasks ];
+            return;
+            }
+    error( "Task not found." );
+
+    }
+    
+
+/**
+ * @brief Add an unlock_task to the given task.
+ *
+ * @param ta The unlocking #task.
+ * @param tb The #task that will be unlocked.
+ */
+ 
+void task_addunlock( struct task *ta , struct task *tb ) {
+
+    int k;
+    
+    /* Bogus? */
+    if ( ta == NULL || tb == NULL )
+        return;
+    
+    /* Check if ta already unlocks tb. */
+    for ( k = 0 ; k < ta->nr_unlock_tasks ; k++ )
+        if ( ta->unlock_tasks[k] == tb )
+            return;
+
+    if ( ta->nr_unlock_tasks == task_maxunlock )
+        error( "Too many unlock_tasks in task." );
+        
+    ta->unlock_tasks[ ta->nr_unlock_tasks] = tb;
+    ta->nr_unlock_tasks += 1;
+
+    }
+    
+
+/**
+ * @brief Split tasks that may be too large.
+ *
+ * @param s The #space we are working in.
+ */
+ 
+void space_splittasks ( struct space *s ) {
+
+    int k, sid, tid;
+    struct cell *ci, *cj;
+    double hi, hj, shift[3];
+    struct task *t;
+
+    /* Loop through the tasks... */
+    for ( tid = 0 ; tid < s->nr_tasks ; tid++ ) {
+    
+        /* Get a pointer on the task. */
+        t = &s->tasks[tid];
+    
+        /* If this task isn't a pair, i'm not interested. */
+        if ( t->type != tid_pair )
+            continue;
+            
+        /* Get a handle on the cells involved. */
+        ci = t->ci;
+        cj = t->cj;
+        hi = fmax( ci->h[0] , fmax( ci->h[1] , ci->h[2] ) );
+        hj = fmax( cj->h[0] , fmax( cj->h[1] , cj->h[2] ) );
+            
+        /* Should this task be split-up? */
+        if ( ci->split && cj->split && ci->r_max < hi/2 && cj->r_max < hj/2 ) {
+        
+            /* Get the relative distance between the pairs, wrapping. */
+            for ( k = 0 ; k < 3 ; k++ ) {
+                if ( cj->loc[k] - ci->loc[k] < -s->dim[k]/2 )
+                    shift[k] = s->dim[k];
+                else if ( cj->loc[k] - ci->loc[k] > s->dim[k]/2 )
+                    shift[k] = -s->dim[k];
+                else
+                    shift[k] = 0.0;
+                }
+
+            /* Get the sorting index. */
+            for ( sid = 0 , k = 0 ; k < 3 ; k++ )
+                sid = 3*sid + ( (cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1 );
+
+            /* Flip? */
+            if ( sid < 13 ) {
+                cj = t->ci;
+                ci = t->cj;
+                t->ci = ci; t->cj = cj;
+                }
+            else
+                sid = 26 - sid;
+                
+            /* Remove the dependency of this task on the sorts of ci and cj. */
+            task_rmunlock( ci->sorts[sid] , t );
+            task_rmunlock( cj->sorts[sid] , t );
+            ci->nr_pairs -= 1;
+            cj->nr_pairs -= 1;
+            t->nr_unlock_cells = 0;
+                
+            /* For each different sorting type... */
+            switch ( sid ) {
+            
+                case 0: /* (  1 ,  1 ,  1 ) */
+                    t->ci = ci->progeny[7]; t->cj = cj->progeny[0];
+                    task_addunlock( ci->progeny[7]->sorts[0] , t ); task_addunlock( cj->progeny[0]->sorts[0] , t );
+                    ci->progeny[7]->nr_pairs += 1;
+                    cj->progeny[0]->nr_pairs += 1;
+                    break;
+                    
+                case 1: /* (  1 ,  1 ,  0 ) */
+                    if ( !ci->progeny[6]->split && !ci->progeny[7]->split &&
+                         !cj->progeny[0]->split && !cj->progeny[1]->split ) {
+                        t->type = tid_sub; t->flags = 1;
+                        task_addunlock( ci->progeny[6]->sorts[1] , t ); task_addunlock( cj->progeny[0]->sorts[1] , t );
+                        task_addunlock( ci->progeny[7]->sorts[1] , t ); task_addunlock( cj->progeny[1]->sorts[1] , t );
+                        task_addunlock( ci->progeny[6]->sorts[0] , t ); task_addunlock( cj->progeny[1]->sorts[0] , t );
+                        task_addunlock( ci->progeny[7]->sorts[2] , t ); task_addunlock( cj->progeny[0]->sorts[2] , t );
+                        }
+                    else {
+                        t->ci = ci->progeny[6]; t->cj = cj->progeny[0];
+                        task_addunlock( ci->progeny[6]->sorts[1] , t ); task_addunlock( cj->progeny[0]->sorts[1] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[1] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[1] , t ); task_addunlock( cj->progeny[1]->sorts[1] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[1] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[6]->sorts[0] , t ); task_addunlock( cj->progeny[1]->sorts[0] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[0] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[2] , t ); task_addunlock( cj->progeny[0]->sorts[2] , t );
+                        }
+                    ci->progeny[6]->nr_pairs += 2;
+                    ci->progeny[7]->nr_pairs += 2;
+                    cj->progeny[0]->nr_pairs += 2;
+                    cj->progeny[1]->nr_pairs += 2;
+                    break;
+                    
+                case 2: /* (  1 ,  1 , -1 ) */
+                    t->ci = ci->progeny[6]; t->cj = cj->progeny[1];
+                    task_addunlock( ci->progeny[6]->sorts[2] , t ); task_addunlock( cj->progeny[1]->sorts[2] , t );
+                    ci->progeny[6]->nr_pairs += 1;
+                    cj->progeny[1]->nr_pairs += 1;
+                    break;
+                    
+                case 3: /* (  1 ,  0 ,  1 ) */
+                    if ( !ci->progeny[5]->split && !ci->progeny[7]->split &&
+                         !cj->progeny[0]->split && !cj->progeny[2]->split ) {
+                        t->type = tid_sub; t->flags = 3;
+                        task_addunlock( ci->progeny[5]->sorts[3] , t ); task_addunlock( cj->progeny[0]->sorts[3] , t );
+                        task_addunlock( ci->progeny[7]->sorts[3] , t ); task_addunlock( cj->progeny[2]->sorts[3] , t );
+                        task_addunlock( ci->progeny[5]->sorts[0] , t ); task_addunlock( cj->progeny[2]->sorts[0] , t );
+                        task_addunlock( ci->progeny[7]->sorts[6] , t ); task_addunlock( cj->progeny[0]->sorts[6] , t );
+                        }
+                    else {
+                        t->ci = ci->progeny[5]; t->cj = cj->progeny[0];
+                        task_addunlock( ci->progeny[5]->sorts[3] , t ); task_addunlock( cj->progeny[0]->sorts[3] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[2] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[3] , t ); task_addunlock( cj->progeny[2]->sorts[3] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[2] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[5]->sorts[0] , t ); task_addunlock( cj->progeny[2]->sorts[0] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[0] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[6] , t ); task_addunlock( cj->progeny[0]->sorts[6] , t );
+                        }
+                    ci->progeny[5]->nr_pairs += 2;
+                    ci->progeny[7]->nr_pairs += 2;
+                    cj->progeny[0]->nr_pairs += 2;
+                    cj->progeny[2]->nr_pairs += 2;
+                    break;
+                    
+                case 4: /* (  1 ,  0 ,  0 ) */
+                    if ( !ci->progeny[4]->split && !ci->progeny[5]->split && !ci->progeny[6]->split && !ci->progeny[7]->split &&
+                         !cj->progeny[0]->split && !cj->progeny[1]->split && !cj->progeny[2]->split && !cj->progeny[3]->split) {
+                        t->type = tid_sub; t->flags = 4;
+                        task_addunlock( ci->progeny[4]->sorts[4] , t ); task_addunlock( cj->progeny[0]->sorts[4] , t );
+                        task_addunlock( ci->progeny[5]->sorts[5] , t ); task_addunlock( cj->progeny[0]->sorts[5] , t );
+                        task_addunlock( ci->progeny[6]->sorts[7] , t ); task_addunlock( cj->progeny[0]->sorts[7] , t );
+                        task_addunlock( ci->progeny[7]->sorts[8] , t ); task_addunlock( cj->progeny[0]->sorts[8] , t );
+                        task_addunlock( ci->progeny[4]->sorts[3] , t ); task_addunlock( cj->progeny[1]->sorts[3] , t );
+                        task_addunlock( ci->progeny[5]->sorts[4] , t ); task_addunlock( cj->progeny[1]->sorts[4] , t );
+                        task_addunlock( ci->progeny[6]->sorts[6] , t ); task_addunlock( cj->progeny[1]->sorts[6] , t );
+                        task_addunlock( ci->progeny[7]->sorts[7] , t ); task_addunlock( cj->progeny[1]->sorts[7] , t );
+                        task_addunlock( ci->progeny[4]->sorts[1] , t ); task_addunlock( cj->progeny[2]->sorts[1] , t );
+                        task_addunlock( ci->progeny[5]->sorts[2] , t ); task_addunlock( cj->progeny[2]->sorts[2] , t );
+                        task_addunlock( ci->progeny[6]->sorts[4] , t ); task_addunlock( cj->progeny[2]->sorts[4] , t );
+                        task_addunlock( ci->progeny[7]->sorts[5] , t ); task_addunlock( cj->progeny[2]->sorts[5] , t );
+                        task_addunlock( ci->progeny[4]->sorts[0] , t ); task_addunlock( cj->progeny[3]->sorts[0] , t );
+                        task_addunlock( ci->progeny[5]->sorts[1] , t ); task_addunlock( cj->progeny[3]->sorts[1] , t );
+                        task_addunlock( ci->progeny[6]->sorts[3] , t ); task_addunlock( cj->progeny[3]->sorts[3] , t );
+                        task_addunlock( ci->progeny[7]->sorts[4] , t ); task_addunlock( cj->progeny[3]->sorts[4] , t );
+                        }
+                    else {
+                        t->ci = ci->progeny[4]; t->cj = cj->progeny[0];
+                        task_addunlock( ci->progeny[4]->sorts[4] , t ); task_addunlock( cj->progeny[0]->sorts[4] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[0] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[5]->sorts[5] , t ); task_addunlock( cj->progeny[0]->sorts[5] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[0] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[6]->sorts[7] , t ); task_addunlock( cj->progeny[0]->sorts[7] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[0] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[8] , t ); task_addunlock( cj->progeny[0]->sorts[8] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[4] , cj->progeny[1] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[4]->sorts[3] , t ); task_addunlock( cj->progeny[1]->sorts[3] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[1] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[5]->sorts[4] , t ); task_addunlock( cj->progeny[1]->sorts[4] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[1] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[6]->sorts[6] , t ); task_addunlock( cj->progeny[1]->sorts[6] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[1] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[7] , t ); task_addunlock( cj->progeny[1]->sorts[7] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[4] , cj->progeny[2] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[4]->sorts[1] , t ); task_addunlock( cj->progeny[2]->sorts[1] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[2] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[5]->sorts[2] , t ); task_addunlock( cj->progeny[2]->sorts[2] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[2] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[6]->sorts[4] , t ); task_addunlock( cj->progeny[2]->sorts[4] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[2] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[5] , t ); task_addunlock( cj->progeny[2]->sorts[5] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[4] , cj->progeny[3] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[4]->sorts[0] , t ); task_addunlock( cj->progeny[3]->sorts[0] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[3] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[5]->sorts[1] , t ); task_addunlock( cj->progeny[3]->sorts[1] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[3] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[6]->sorts[3] , t ); task_addunlock( cj->progeny[3]->sorts[3] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[3] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[4] , t ); task_addunlock( cj->progeny[3]->sorts[4] , t );
+                        }
+                    ci->progeny[4]->nr_pairs += 4;
+                    ci->progeny[5]->nr_pairs += 4;
+                    ci->progeny[6]->nr_pairs += 4;
+                    ci->progeny[7]->nr_pairs += 4;
+                    cj->progeny[0]->nr_pairs += 4;
+                    cj->progeny[1]->nr_pairs += 4;
+                    cj->progeny[2]->nr_pairs += 4;
+                    cj->progeny[3]->nr_pairs += 4;
+                    break;
+                    
+                case 5: /* (  1 ,  0 , -1 ) */
+                    if ( !ci->progeny[4]->split && !ci->progeny[6]->split &&
+                         !cj->progeny[1]->split && !cj->progeny[3]->split ) {
+                        t->type = tid_sub; t->flags = 5;
+                        task_addunlock( ci->progeny[4]->sorts[5] , t ); task_addunlock( cj->progeny[1]->sorts[5] , t );
+                        task_addunlock( ci->progeny[6]->sorts[5] , t ); task_addunlock( cj->progeny[3]->sorts[5] , t );
+                        task_addunlock( ci->progeny[4]->sorts[2] , t ); task_addunlock( cj->progeny[3]->sorts[2] , t );
+                        task_addunlock( ci->progeny[6]->sorts[8] , t ); task_addunlock( cj->progeny[1]->sorts[8] , t );
+                        }
+                    else {
+                        t->ci = ci->progeny[4]; t->cj = cj->progeny[1];
+                        task_addunlock( ci->progeny[4]->sorts[5] , t ); task_addunlock( cj->progeny[1]->sorts[5] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[3] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[6]->sorts[5] , t ); task_addunlock( cj->progeny[3]->sorts[5] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[4] , cj->progeny[3] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[4]->sorts[2] , t ); task_addunlock( cj->progeny[3]->sorts[2] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[1] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[6]->sorts[8] , t ); task_addunlock( cj->progeny[1]->sorts[8] , t );
+                        }
+                    ci->progeny[4]->nr_pairs += 2;
+                    ci->progeny[6]->nr_pairs += 2;
+                    cj->progeny[1]->nr_pairs += 2;
+                    cj->progeny[3]->nr_pairs += 2;
+                    break;
+                    
+                case 6: /* (  1 , -1 ,  1 ) */
+                    t->ci = ci->progeny[5]; t->cj = cj->progeny[2];
+                    task_addunlock( ci->progeny[5]->sorts[6] , t ); task_addunlock( cj->progeny[2]->sorts[6] , t );
+                    ci->progeny[5]->nr_pairs += 1;
+                    cj->progeny[2]->nr_pairs += 1;
+                    break;
+                    
+                case 7: /* (  1 , -1 ,  0 ) */
+                    if ( !ci->progeny[4]->split && !ci->progeny[5]->split &&
+                         !cj->progeny[2]->split && !cj->progeny[3]->split ) {
+                        t->type = tid_sub; t->flags = 7;
+                        task_addunlock( ci->progeny[4]->sorts[6] , t ); task_addunlock( cj->progeny[3]->sorts[6] , t );
+                        task_addunlock( ci->progeny[5]->sorts[8] , t ); task_addunlock( cj->progeny[2]->sorts[8] , t );
+                        task_addunlock( ci->progeny[4]->sorts[7] , t ); task_addunlock( cj->progeny[2]->sorts[7] , t );
+                        task_addunlock( ci->progeny[5]->sorts[7] , t ); task_addunlock( cj->progeny[3]->sorts[7] , t );
+                        }
+                    else {
+                        t->ci = ci->progeny[4]; t->cj = cj->progeny[3];
+                        task_addunlock( ci->progeny[4]->sorts[6] , t ); task_addunlock( cj->progeny[3]->sorts[6] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[2] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[5]->sorts[8] , t ); task_addunlock( cj->progeny[2]->sorts[8] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[4] , cj->progeny[2] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[4]->sorts[7] , t ); task_addunlock( cj->progeny[2]->sorts[7] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[3] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[5]->sorts[7] , t ); task_addunlock( cj->progeny[3]->sorts[7] , t );
+                        }
+                    ci->progeny[4]->nr_pairs += 2;
+                    ci->progeny[5]->nr_pairs += 2;
+                    cj->progeny[2]->nr_pairs += 2;
+                    cj->progeny[3]->nr_pairs += 2;
+                    break;
+                    
+                case 8: /* (  1 , -1 , -1 ) */
+                    t->ci = ci->progeny[4]; t->cj = cj->progeny[3];
+                    task_addunlock( ci->progeny[4]->sorts[8] , t ); task_addunlock( cj->progeny[3]->sorts[8] , t );
+                    ci->progeny[4]->nr_pairs += 1;
+                    cj->progeny[3]->nr_pairs += 1;
+                    break;
+                    
+                case 9: /* (  0 ,  1 ,  1 ) */
+                    if ( !ci->progeny[3]->split && !ci->progeny[7]->split &&
+                         !cj->progeny[0]->split && !cj->progeny[4]->split ) {
+                        t->type = tid_sub; t->flags = 9;
+                        task_addunlock( ci->progeny[3]->sorts[9] , t ); task_addunlock( cj->progeny[0]->sorts[9] , t );
+                        task_addunlock( ci->progeny[7]->sorts[9] , t ); task_addunlock( cj->progeny[4]->sorts[9] , t );
+                        task_addunlock( ci->progeny[3]->sorts[0] , t ); task_addunlock( cj->progeny[4]->sorts[0] , t );
+                        task_addunlock( ci->progeny[7]->sorts[8] , t ); task_addunlock( cj->progeny[0]->sorts[8] , t );
+                        }
+                    else {
+                        t->ci = ci->progeny[3]; t->cj = cj->progeny[0];
+                        task_addunlock( ci->progeny[3]->sorts[9] , t ); task_addunlock( cj->progeny[0]->sorts[9] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[4] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[9] , t ); task_addunlock( cj->progeny[4]->sorts[9] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[4] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[3]->sorts[0] , t ); task_addunlock( cj->progeny[4]->sorts[0] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[0] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[8] , t ); task_addunlock( cj->progeny[0]->sorts[8] , t );
+                        }
+                    ci->progeny[3]->nr_pairs += 2;
+                    ci->progeny[7]->nr_pairs += 2;
+                    cj->progeny[0]->nr_pairs += 2;
+                    cj->progeny[4]->nr_pairs += 2;
+                    break;
+                    
+                case 10: /* (  0 ,  1 ,  0 ) */
+                    if ( !ci->progeny[2]->split && !ci->progeny[3]->split && !ci->progeny[6]->split && !ci->progeny[7]->split &&
+                         !cj->progeny[0]->split && !cj->progeny[1]->split && !cj->progeny[4]->split && !cj->progeny[5]->split) {
+                        t->type = tid_sub; t->flags = 10;
+                        task_addunlock( ci->progeny[2]->sorts[10] , t ); task_addunlock( cj->progeny[0]->sorts[10] , t );
+                        task_addunlock( ci->progeny[3]->sorts[11] , t ); task_addunlock( cj->progeny[0]->sorts[11] , t );
+                        task_addunlock( ci->progeny[6]->sorts[7] , t ); task_addunlock( cj->progeny[0]->sorts[7] , t );
+                        task_addunlock( ci->progeny[7]->sorts[6] , t ); task_addunlock( cj->progeny[0]->sorts[6] , t );
+                        task_addunlock( ci->progeny[2]->sorts[9] , t ); task_addunlock( cj->progeny[1]->sorts[9] , t );
+                        task_addunlock( ci->progeny[3]->sorts[10] , t ); task_addunlock( cj->progeny[1]->sorts[10] , t );
+                        task_addunlock( ci->progeny[6]->sorts[8] , t ); task_addunlock( cj->progeny[1]->sorts[8] , t );
+                        task_addunlock( ci->progeny[7]->sorts[7] , t ); task_addunlock( cj->progeny[1]->sorts[7] , t );
+                        task_addunlock( ci->progeny[2]->sorts[1] , t ); task_addunlock( cj->progeny[4]->sorts[1] , t );
+                        task_addunlock( ci->progeny[3]->sorts[2] , t ); task_addunlock( cj->progeny[4]->sorts[2] , t );
+                        task_addunlock( ci->progeny[6]->sorts[10] , t ); task_addunlock( cj->progeny[4]->sorts[10] , t );
+                        task_addunlock( ci->progeny[7]->sorts[11] , t ); task_addunlock( cj->progeny[4]->sorts[11] , t );
+                        task_addunlock( ci->progeny[2]->sorts[0] , t ); task_addunlock( cj->progeny[5]->sorts[0] , t );
+                        task_addunlock( ci->progeny[3]->sorts[1] , t ); task_addunlock( cj->progeny[5]->sorts[1] , t );
+                        task_addunlock( ci->progeny[6]->sorts[9] , t ); task_addunlock( cj->progeny[5]->sorts[9] , t );
+                        task_addunlock( ci->progeny[7]->sorts[10] , t ); task_addunlock( cj->progeny[5]->sorts[10] , t );
+                        }
+                    else {
+                        t->ci = ci->progeny[2]; t->cj = cj->progeny[0];
+                        task_addunlock( ci->progeny[2]->sorts[10] , t ); task_addunlock( cj->progeny[0]->sorts[10] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[0] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[3]->sorts[11] , t ); task_addunlock( cj->progeny[0]->sorts[11] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[0] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[6]->sorts[7] , t ); task_addunlock( cj->progeny[0]->sorts[7] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[0] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[6] , t ); task_addunlock( cj->progeny[0]->sorts[6] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[2] , cj->progeny[1] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[2]->sorts[9] , t ); task_addunlock( cj->progeny[1]->sorts[9] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[1] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[3]->sorts[10] , t ); task_addunlock( cj->progeny[1]->sorts[10] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[1] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[6]->sorts[8] , t ); task_addunlock( cj->progeny[1]->sorts[8] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[1] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[7] , t ); task_addunlock( cj->progeny[1]->sorts[7] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[2] , cj->progeny[4] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[2]->sorts[1] , t ); task_addunlock( cj->progeny[4]->sorts[1] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[4] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[3]->sorts[2] , t ); task_addunlock( cj->progeny[4]->sorts[2] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[4] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[6]->sorts[10] , t ); task_addunlock( cj->progeny[4]->sorts[10] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[4] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[11] , t ); task_addunlock( cj->progeny[4]->sorts[11] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[2] , cj->progeny[5] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[2]->sorts[0] , t ); task_addunlock( cj->progeny[5]->sorts[0] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[5] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[3]->sorts[1] , t ); task_addunlock( cj->progeny[5]->sorts[1] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[5] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[6]->sorts[9] , t ); task_addunlock( cj->progeny[5]->sorts[9] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[5] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[10] , t ); task_addunlock( cj->progeny[5]->sorts[10] , t );
+                        }
+                    ci->progeny[2]->nr_pairs += 4;
+                    ci->progeny[3]->nr_pairs += 4;
+                    ci->progeny[6]->nr_pairs += 4;
+                    ci->progeny[7]->nr_pairs += 4;
+                    cj->progeny[0]->nr_pairs += 4;
+                    cj->progeny[1]->nr_pairs += 4;
+                    cj->progeny[4]->nr_pairs += 4;
+                    cj->progeny[5]->nr_pairs += 4;
+                    break;
+                    
+                case 11: /* (  0 ,  1 , -1 ) */
+                    if ( !ci->progeny[2]->split && !ci->progeny[6]->split &&
+                         !cj->progeny[1]->split && !cj->progeny[5]->split ) {
+                        t->type = tid_sub; t->flags = 11;
+                        task_addunlock( ci->progeny[2]->sorts[11] , t ); task_addunlock( cj->progeny[1]->sorts[11] , t );
+                        task_addunlock( ci->progeny[6]->sorts[11] , t ); task_addunlock( cj->progeny[5]->sorts[11] , t );
+                        task_addunlock( ci->progeny[2]->sorts[2] , t ); task_addunlock( cj->progeny[5]->sorts[2] , t );
+                        task_addunlock( ci->progeny[6]->sorts[6] , t ); task_addunlock( cj->progeny[1]->sorts[6] , t );
+                        }
+                    else {
+                        t->ci = ci->progeny[2]; t->cj = cj->progeny[1];
+                        task_addunlock( ci->progeny[2]->sorts[11] , t ); task_addunlock( cj->progeny[1]->sorts[11] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[5] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[6]->sorts[11] , t ); task_addunlock( cj->progeny[5]->sorts[11] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[2] , cj->progeny[5] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[2]->sorts[2] , t ); task_addunlock( cj->progeny[5]->sorts[2] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[6] , cj->progeny[1] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[6]->sorts[6] , t ); task_addunlock( cj->progeny[1]->sorts[6] , t );
+                        }
+                    ci->progeny[2]->nr_pairs += 2;
+                    ci->progeny[6]->nr_pairs += 2;
+                    cj->progeny[1]->nr_pairs += 2;
+                    cj->progeny[5]->nr_pairs += 2;
+                    break;
+                    
+                case 12: /* (  0 ,  0 ,  1 ) */
+                    if ( !ci->progeny[1]->split && !ci->progeny[3]->split && !ci->progeny[5]->split && !ci->progeny[7]->split &&
+                         !cj->progeny[0]->split && !cj->progeny[2]->split && !cj->progeny[4]->split && !cj->progeny[6]->split) {
+                        t->type = tid_sub; t->flags = 12;
+                        task_addunlock( ci->progeny[1]->sorts[12] , t ); task_addunlock( cj->progeny[0]->sorts[12] , t );
+                        task_addunlock( ci->progeny[3]->sorts[11] , t ); task_addunlock( cj->progeny[0]->sorts[11] , t );
+                        task_addunlock( ci->progeny[5]->sorts[5] , t ); task_addunlock( cj->progeny[0]->sorts[5] , t );
+                        task_addunlock( ci->progeny[7]->sorts[2] , t ); task_addunlock( cj->progeny[0]->sorts[2] , t );
+                        task_addunlock( ci->progeny[1]->sorts[9] , t ); task_addunlock( cj->progeny[2]->sorts[9] , t );
+                        task_addunlock( ci->progeny[3]->sorts[12] , t ); task_addunlock( cj->progeny[2]->sorts[12] , t );
+                        task_addunlock( ci->progeny[5]->sorts[8] , t ); task_addunlock( cj->progeny[2]->sorts[8] , t );
+                        task_addunlock( ci->progeny[7]->sorts[5] , t ); task_addunlock( cj->progeny[2]->sorts[5] , t );
+                        task_addunlock( ci->progeny[1]->sorts[3] , t ); task_addunlock( cj->progeny[4]->sorts[3] , t );
+                        task_addunlock( ci->progeny[3]->sorts[6] , t ); task_addunlock( cj->progeny[4]->sorts[6] , t );
+                        task_addunlock( ci->progeny[5]->sorts[12] , t ); task_addunlock( cj->progeny[4]->sorts[12] , t );
+                        task_addunlock( ci->progeny[7]->sorts[11] , t ); task_addunlock( cj->progeny[4]->sorts[11] , t );
+                        task_addunlock( ci->progeny[1]->sorts[0] , t ); task_addunlock( cj->progeny[6]->sorts[0] , t );
+                        task_addunlock( ci->progeny[3]->sorts[3] , t ); task_addunlock( cj->progeny[6]->sorts[3] , t );
+                        task_addunlock( ci->progeny[5]->sorts[9] , t ); task_addunlock( cj->progeny[6]->sorts[9] , t );
+                        task_addunlock( ci->progeny[7]->sorts[12] , t ); task_addunlock( cj->progeny[6]->sorts[12] , t );
+                        }
+                    else {
+                        t->ci = ci->progeny[1]; t->cj = cj->progeny[0];
+                        task_addunlock( ci->progeny[1]->sorts[12] , t ); task_addunlock( cj->progeny[0]->sorts[12] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[0] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[3]->sorts[11] , t ); task_addunlock( cj->progeny[0]->sorts[11] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[0] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[5]->sorts[5] , t ); task_addunlock( cj->progeny[0]->sorts[5] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[0] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[2] , t ); task_addunlock( cj->progeny[0]->sorts[2] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[1] , cj->progeny[2] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[1]->sorts[9] , t ); task_addunlock( cj->progeny[2]->sorts[9] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[2] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[3]->sorts[12] , t ); task_addunlock( cj->progeny[2]->sorts[12] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[2] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[5]->sorts[8] , t ); task_addunlock( cj->progeny[2]->sorts[8] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[2] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[5] , t ); task_addunlock( cj->progeny[2]->sorts[5] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[1] , cj->progeny[4] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[1]->sorts[3] , t ); task_addunlock( cj->progeny[4]->sorts[3] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[4] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[3]->sorts[6] , t ); task_addunlock( cj->progeny[4]->sorts[6] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[4] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[5]->sorts[12] , t ); task_addunlock( cj->progeny[4]->sorts[12] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[4] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[11] , t ); task_addunlock( cj->progeny[4]->sorts[11] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[1] , cj->progeny[6] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[1]->sorts[0] , t ); task_addunlock( cj->progeny[6]->sorts[0] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[3] , cj->progeny[6] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[3]->sorts[3] , t ); task_addunlock( cj->progeny[6]->sorts[3] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[5] , cj->progeny[6] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[5]->sorts[9] , t ); task_addunlock( cj->progeny[6]->sorts[9] , t );
+                        t = space_addtask( s , tid_pair , 0 , 0 , ci->progeny[7] , cj->progeny[6] , NULL , 0 , NULL , 0 );
+                        task_addunlock( ci->progeny[7]->sorts[12] , t ); task_addunlock( cj->progeny[6]->sorts[12] , t );
+                        }
+                    ci->progeny[1]->nr_pairs += 4;
+                    ci->progeny[3]->nr_pairs += 4;
+                    ci->progeny[5]->nr_pairs += 4;
+                    ci->progeny[7]->nr_pairs += 4;
+                    cj->progeny[0]->nr_pairs += 4;
+                    cj->progeny[2]->nr_pairs += 4;
+                    cj->progeny[4]->nr_pairs += 4;
+                    cj->progeny[6]->nr_pairs += 4;
+                    break;
+            
+                }
+                
+            /* Take a step back... */
+            tid -= 1;
+            
+            } /* split this task? */
+    
+        } /* loop over all tasks. */
+        
+    }
+    
+    
+/**
+ * @brief Fill the #space's task list.
+ *
+ * @param s The #space we are working in.
+ * @param do_sort Flag to add sorting tasks to the list.
+ */
+ 
+void space_maketasks ( struct space *s , int do_sort ) {
+
+    int i, j, k, ii, jj, kk, iii, jjj, kkk, cid, cjd;
+    int *cdim = s->cdim;
+    int nr_tasks_old = s->nr_tasks;
+    struct task *t;
+    int pts[7][8] = { { -1 , 12 , 10 , 9 , 4 , 3 , 1 , 0 } ,
+                      { -1 , -1 , 11 , 10 , 5 , 4 , 2 , 1 } ,
+                      { -1 , -1 , -1 , 12 , 7 , 6 , 4 , 3 } , 
+                      { -1 , -1 , -1 , -1 , 8 , 7 , 5 , 4 } ,
+                      { -1 , -1 , -1 , -1 , -1 , 12 , 10 , 9 } ,
+                      { -1 , -1 , -1 , -1 , -1 , -1 , 11 , 10 } ,
+                      { -1 , -1 , -1 , -1 , -1 , -1 , -1 , 12 } };
+    int counts[tid_count];
+
+    /* Recursive function to generate tasks in the cell tree. */
+    void maketasks_rec ( struct cell *c , struct task *sort_up[] , int nr_sort_up , struct cell *parent ) {
+
+        int j, k, nr_sort = 0;
+        struct task *sort[14], *t;
+
+        /* Clear the waits on this cell. */
+        c->wait = 0;
+        sort[0] = NULL;
+        
+        /* Start by generating the sort task. */
+        if ( c->count > 0 ) {
+        
+            if ( do_sort ) {
+                if ( c->count < 500 ) {
+                    sort[0] = space_addtask( s , tid_sort , 0x1fff , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    for ( k = 0 ; k < 13 ; k++ )
+                        c->sorts[k] = sort[0];
+                    nr_sort = 1;
+                    }
+                else if ( c->count < 2000 ) {
+                    sort[0] = space_addtask( s , tid_sort , 0xf , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    sort[1] = space_addtask( s , tid_sort , 0xf0 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    sort[2] = space_addtask( s , tid_sort , 0x1f00 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    for ( k = 0 ; k < 4 ; k++ )
+                        c->sorts[k] = sort[0];
+                    for ( k = 4 ; k < 8 ; k++ )
+                        c->sorts[k] = sort[1];
+                    for ( k = 8 ; k < 13 ; k++ )
+                        c->sorts[k] = sort[2];
+                    nr_sort = 3;
+                    }
+                else {
+                    c->sorts[0] = sort[0] = space_addtask( s , tid_sort , 0x1 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[1] = sort[1] = space_addtask( s , tid_sort , 0x2 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[2] = sort[2] = space_addtask( s , tid_sort , 0x4 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[3] = sort[3] = space_addtask( s , tid_sort , 0x8 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[4] = sort[4] = space_addtask( s , tid_sort , 0x10 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[5] = sort[5] = space_addtask( s , tid_sort , 0x20 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[6] = sort[6] = space_addtask( s , tid_sort , 0x40 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[7] = sort[7] = space_addtask( s , tid_sort , 0x80 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[8] = sort[8] = space_addtask( s , tid_sort , 0x100 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[9] = sort[9] = space_addtask( s , tid_sort , 0x200 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[10] = sort[10] = space_addtask( s , tid_sort , 0x400 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[11] = sort[11] = space_addtask( s , tid_sort , 0x800 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    c->sorts[12] = sort[12] = space_addtask( s , tid_sort , 0x1000 , 0 , c , NULL , sort_up , nr_sort_up , NULL , 0 );
+                    nr_sort = 13;
+                    }
+                }
+
+            /* Generate a self-interaction if not split. */
+            if ( !c->split && c->count > 1 )
+                space_addtask( s , tid_self , 0 , 0 , c , NULL , NULL , 0 , NULL , 0 );
+                
+            }
+            
+        /* Otherwise, add the interactions between progeny. */
+        if ( c->split ) {
+        
+            /* Recurse. */
+            for ( k = 0 ; k < 8 ; k++ )
+                if ( c->progeny[k] != NULL )
+                    maketasks_rec( c->progeny[k] , sort , nr_sort , c );
+                        
+            /* Worth splitting into several tasks? */
+            if ( c->count > 2*space_splitsize ) {
+            
+                /* Make a task for eac pair of progeny. */
+                for ( j = 0 ; j < 8 ; j++ )
+                    if ( c->progeny[j] != NULL && c->progeny[j]->count > 0 )
+                        for ( k = j + 1 ; k < 8 ; k++ )
+                            if ( c->progeny[k] != NULL && c->progeny[k]->count > 0 ) {
+                                t = space_addtask( s , tid_pair , 0 , 0 , c->progeny[j] , c->progeny[k] , NULL , 0 , NULL , 0 );
+                                task_addunlock( c->progeny[j]->sorts[ pts[j][k] ] , t );
+                                task_addunlock( c->progeny[k]->sorts[ pts[j][k] ] , t );
+                                c->progeny[k]->nr_pairs += 1;
+                                c->progeny[j]->nr_pairs += 1;
+                                }
+                                
+                }
+                
+            /* Otherwise, dispatch as one large task. */
+            else {
+            
+                /* Add the task. */
+                t = space_addtask( s , tid_sub , 0 , 0 , c , NULL , NULL , 0 , NULL , 0 );
+                
+                /* Make it depend on all the sorts of its progeny. */
+                for ( k = 0 ; k < 8 ; k++ )
+                    for ( j = 0 ; j < 13 ; j++ )
+                        task_addunlock( c->progeny[k]->sorts[j] , t );
+            
+                }
+
+            }
+
+        }
+        
+    /* Allocate the task-list, if needed. */
+    if ( s->tasks == NULL )
+        if ( posix_memalign( (void *)&s->tasks , 64 , sizeof(struct task) * s->tot_cells * 14 ) != 0 )
+            error( "Failed to allocate task list." );
+    s->nr_tasks = 0;
+    
+    /* Loop over the cells and get their sub-tasks. */
+    for ( k = 0 ; k < s->nr_cells ; k++ )
+        maketasks_rec( &s->cells[k] , NULL , 0 , NULL );
+
+    /* Run through the highest level of cells and add pairs. */
+    for ( i = 0 ; i < cdim[0] ; i++ )
+        for ( j = 0 ; j < cdim[1] ; j++ )
+            for ( k = 0 ; k < cdim[2] ; k++ ) {
+                cid = cell_getid( cdim , i , j , k );
+                if ( s->cells[cid].count == 0 )
+                    continue;
+                for ( ii = -1 ; ii < 2 ; ii++ ) {
+                    iii = i + ii;
+                    if ( !s->periodic && ( iii < 0 || iii >= cdim[0] ) )
+                        continue;
+                    iii = ( iii + cdim[0] ) % cdim[0];
+                    for ( jj = -1 ; jj < 2 ; jj++ ) {
+                        jjj = j + jj;
+                        if ( !s->periodic && ( jjj < 0 || jjj >= cdim[1] ) )
+                            continue;
+                        jjj = ( jjj + cdim[1] ) % cdim[1];
+                        for ( kk = -1 ; kk < 2 ; kk++ ) {
+                            kkk = k + kk;
+                            if ( !s->periodic && ( kkk < 0 || kkk >= cdim[2] ) )
+                                continue;
+                            kkk = ( kkk + cdim[2] ) % cdim[2];
+                            cjd = cell_getid( cdim , iii , jjj , kkk );
+                            if ( s->cells[cjd].count == 0 )
+                                continue;
+                            if ( cid >= cjd )
+                                continue;
+                            t = space_addtask( s , tid_pair , 0 , 0 , &s->cells[cid] , &s->cells[cjd] , NULL , 0 , NULL , 0 );
+                            task_addunlock( s->cells[cid].sorts[ sortlistID[ (kk+1) + 3*( (jj+1) + 3*(ii+1) ) ] ] , t );
+                            task_addunlock( s->cells[cjd].sorts[ sortlistID[ (kk+1) + 3*( (jj+1) + 3*(ii+1) ) ] ] , t );
+                            s->cells[cid].nr_pairs += 1;
+                            s->cells[cjd].nr_pairs += 1;
+                            }
+                        }
+                    }
+                }
+
+    /* Split the tasks. */
+    space_splittasks( s );
+        
+    /* Did we already create indices? */
+    if ( s->tasks_ind == NULL )
+        if ( ( s->tasks_ind = (int *)malloc( sizeof(int) * s->nr_tasks ) ) == NULL )
+            error( "Failed to allocate task indices." );
+    
+    /* Did the number of tasks change, i.e. do we have to re-index? */
+    if ( nr_tasks_old != s->nr_tasks )
+        for ( k = 0 ; k < s->nr_tasks ; k++ )
+            s->tasks_ind[k] = k;
+            
+    /* Remove sort tasks with no dependencies. */
+    for ( k = 0 ; k < s->nr_tasks ; k++ ) {
+        t = &s->tasks[k];
+        if ( t->type == tid_sort && t->nr_unlock_tasks == 0 ) {
+            t->type = tid_none;
+            if ( t->ci->split )
+                for ( j = 0 ; j < 8 ; j++ )
+                    if ( t->ci->progeny[j] != NULL && t->flags & ( 1 << j ) )
+                        task_rmunlock( t->ci->progeny[j]->sorts[j] , t );
+            }
+        }
+            
+    /* Count the number of each task type. */
+    for ( k = 0 ; k < tid_count ; k++ )
+        counts[k] = 0;
+    for ( k = 0 ; k < s->nr_tasks ; k++ )
+        counts[ s->tasks[k].type ] += 1;
+    printf( "space_maketasks: task counts are [ %s=%i" , taskID_names[0] , counts[0] );
+    for ( k = 1 ; k < tid_count ; k++ )
+        printf( " %s=%i" , taskID_names[k] , counts[k] );
+    printf( " ]\n" );
+        
+    /* Re-set the next task pointer. */
+    s->next_task = 0;
+            
+    }
+    
+    
+/**
+ * @brief Sort the parts into eight bins along the given pivots.
+ *
+ * @param c The #cell array to be sorted.
+ */
+ 
+void cell_split ( struct cell *c  ) {
+
+    int i, j, k;
+    struct part temp, *parts = c->parts;
+    int left[8], right[8];
+    double pivot[3];
+    
+    /* Init the pivot. */
+    for ( k = 0 ; k < 3 ; k++ )
+        pivot[k] = c->loc[k] + c->h[k]/2;
+    
+    /* Split along the x-axis. */
+    i = 0; j = c->count - 1;
+    while ( i < j ) {
+        while ( i < c->count-1 && parts[i].x[0] <= pivot[0] )
+            i += 1;
+        while ( j > 0 && parts[j].x[0] > pivot[0] )
+            j -= 1;
+        if ( i < j ) {
+            temp = parts[i]; parts[i] = parts[j]; parts[j] = temp;
+            }
+        }
+    left[1] = i; right[1] = c->count - 1;
+    left[0] = 0; right[0] = j;
+    
+    /* Split along the y axis, twice. */
+    for ( k = 1 ; k >= 0 ; k-- ) {
+        i = left[k]; j = right[k];
+        while ( i < j ) {
+            while ( i < right[k] && parts[i].x[1] <= pivot[1] )
+                i += 1;
+            while ( j > left[k] && parts[j].x[1] > pivot[1] )
+                j -= 1;
+            if ( i < j ) {
+                temp = parts[i]; parts[i] = parts[j]; parts[j] = temp;
+                }
+            }
+        left[2*k+1] = i; right[2*k+1] = right[k];
+        left[2*k] = left[k]; right[2*k] = j;
+        }
+
+    /* Split along the z axis, four times. */
+    for ( k = 3 ; k >= 0 ; k-- ) {
+        i = left[k]; j = right[k];
+        while ( i < j ) {
+            while ( i < right[k] && parts[i].x[2] <= pivot[2] )
+                i += 1;
+            while ( j > left[k] && parts[j].x[2] > pivot[2] )
+                j -= 1;
+            if ( i < j ) {
+                temp = parts[i]; parts[i] = parts[j]; parts[j] = temp;
+                }
+            }
+        left[2*k+1] = i; right[2*k+1] = right[k];
+        left[2*k] = left[k]; right[2*k] = j;
+        }
+        
+    /* Store the counts and offsets. */
+    for ( k = 0 ; k < 8 ; k++ ) {
+        c->progeny[k]->count = right[k] - left[k] + 1;
+        if ( c->progeny[k]->count < 0 )
+            abort();
+        c->progeny[k]->parts = &c->parts[ left[k] ];
+        }
+        
+    /* Verify a few sub-cells. */
+    /* for ( k = 0 ; k < c->progeny[0]->count ; k++ )
+        if ( c->progeny[0]->parts[k].x[0] > pivot[0] ||
+             c->progeny[0]->parts[k].x[1] > pivot[1] ||
+             c->progeny[0]->parts[k].x[2] > pivot[2] )
+            error( "Sorting failed (progeny=0)." );
+    for ( k = 0 ; k < c->progeny[1]->count ; k++ )
+        if ( c->progeny[1]->parts[k].x[0] > pivot[0] ||
+             c->progeny[1]->parts[k].x[1] > pivot[1] ||
+             c->progeny[1]->parts[k].x[2] <= pivot[2] )
+            error( "Sorting failed (progeny=1)." );
+    for ( k = 0 ; k < c->progeny[2]->count ; k++ )
+        if ( c->progeny[2]->parts[k].x[0] > pivot[0] ||
+             c->progeny[2]->parts[k].x[1] <= pivot[1] ||
+             c->progeny[2]->parts[k].x[2] > pivot[2] )
+            error( "Sorting failed (progeny=2)." ); */
+
+    }
+
+
+/**
+ * @brief Split cells that contain too many particles.
+ *
+ * @param s The #space we are working in.
+ * @param c The #cell under consideration.
+ */
+ 
+void space_split ( struct space *s , struct cell *c ) {
+
+    int k, count;
+    double r, r_limit, r_max = 0.0;
+    struct cell *temp;
+    
+    /* Check the depth. */
+    if ( c->depth > s->maxdepth )
+        s->maxdepth = c->depth;
+    
+    /* Set the minimum cutoff. */
+    r_limit = fmin( c->h[0] , fmin( c->h[1] , c->h[2] ) ) / 2;
+    
+    /* Count the particles below that. */
+    for ( count = 0 , k = 0 ; k < c->count ; k++ ) {
+        r = c->parts[k].r;
+        if ( r <= r_limit )
+            count += 1;
+        if ( r > r_max )
+            r_max = r;
+        }
+    c->r_max = r_max;
+            
+    /* Split or let it be? */
+    if ( count > c->count*space_splitratio && c->count > space_splitsize ) {
+    
+        /* No longer just a leaf. */
+        c->split = 1;
+        
+        /* Create the cell's progeny. */
+        for ( k = 0 ; k < 8 ; k++ ) {
+            temp = space_getcell( s );
+            temp->count = 0;
+            temp->loc[0] = c->loc[0];
+            temp->loc[1] = c->loc[1];
+            temp->loc[2] = c->loc[2];
+            temp->h[0] = c->h[0]/2;
+            temp->h[1] = c->h[1]/2;
+            temp->h[2] = c->h[2]/2;
+            if ( k & 4 )
+                temp->loc[0] += temp->h[0];
+            if ( k & 2 )
+                temp->loc[1] += temp->h[1];
+            if ( k & 1 )
+                temp->loc[2] += temp->h[2];
+            temp->depth = c->depth + 1;
+            temp->split = 0;
+            temp->r_max = 0.0;
+            temp->parent = c;
+            c->progeny[k] = temp;
+            }
+            
+        /* Split the cell data. */
+        cell_split( c );
+            
+        /* Recurse? */
+        for ( k = 0 ; k < 8 ; k++ )
+            space_split( s , c->progeny[k] );
+            
+        /* Remove any progeny with zero parts. */
+        for ( k = 0 ; k < 8 ; k++ )
+            if ( c->progeny[k]->count == 0 ) {
+                space_recycle( s , c->progeny[k] );
+                c->progeny[k] = NULL;
+                }
+            
+        }
+        
+    /* Otherwise, set the progeny to null. */
+    else {
+        bzero( c->progeny , sizeof(struct cell *) * 8 );
+        c->split = 0;
+        }
+
+    }
+
+
+/**
+ * @brief Return a used cell to the cell buffer.
+ *
+ * @param s The #space.
+ * @param c The #cell.
+ */
+ 
+void space_recycle ( struct space *s , struct cell *c ) {
+
+    /* Clear the cell. */
+    if ( lock_destroy( &c->lock ) != 0 )
+        error( "Failed to destroy spinlock." );
+    
+    /* Hook this cell into the buffer. */
+    c->next = s->cells_new;
+    s->cells_new = c;
+    s->tot_cells -= 1;
+    
+    }
+
+
+/**
+ * @brief Get a new empty cell.
+ *
+ * @param s The #space.
+ */
+ 
+struct cell *space_getcell ( struct space *s ) {
+
+    struct cell *c;
+    int k;
+    
+    /* Is the buffer empty? */
+    if ( s->cells_new == NULL ) {
+        if ( posix_memalign( (void *)&s->cells_new , 64 , space_cellallocchunk * sizeof(struct cell) ) != 0 )
+            error( "Failed to allocate more cells." );
+        bzero( s->cells_new , space_cellallocchunk * sizeof(struct cell) );
+        for ( k = 0 ; k < space_cellallocchunk-1 ; k++ )
+            s->cells_new[k].next = &s->cells_new[k+1];
+        s->cells_new[ space_cellallocchunk-1 ].next = NULL;
+        }
+
+    /* Pick off the next cell. */
+    c = s->cells_new;
+    s->cells_new = c->next;
+    s->tot_cells += 1;
+    
+    /* Init some things in the cell. */
+    bzero( c , sizeof(struct cell) );
+    if ( lock_init( &c->lock ) != 0 )
+        error( "Failed to initialize cell spinlock." );
+        
+    return c;
+
+    }
+
+
+/**
+ * @brief Split the space into cells given the array of particles.
+ *
+ * @param The #space to initialize.
+ * @param dim Spatial dimensions of the domain.
+ * @param parts Pointer to an array of #part.
+ * @param N The number of parts in the space.
+ * @param periodic flag whether the domain is periodic or not.
+ *
+ * Makes a grid of edge length > r_max and fills the particles
+ * into the respective cells. Cells containing more than #space_maxppc
+ * parts with a cutoff below half the cell width are then split
+ * recursively.
+ */
+
+
+void space_init ( struct space *s , double dim[3] , struct part *parts , int N , int periodic , double h_max ) {
+
+    int i, j, k;
+    int nr_cells, cdim[3];
+    double r_min, r_max, h[3], ih[3];
+    struct cell *c, *cells;
+    struct part *parts_new, *finger;
+    
+    
+    /* Get the minimum and maximum cutoff radii. */
+    r_min = parts[0].r; r_max = r_min;
+    for ( k = 1 ; k < N ; k++ )
+        if ( parts[k].r < r_min )
+            r_min = parts[k].r;
+        else if ( parts[k].r > r_max )
+            r_max = parts[k].r;
+            
+    /* Get the cell width. */
+    if ( h_max < r_max )
+        h_max = r_max;
+    for ( k = 0 ; k < 3 ; k++ ) {
+        cdim[k] = ceil( dim[k] / h_max );
+        h[k] = dim[k] / cdim[k];
+        ih[k] = 1.0 / h[k];
+        }
+        
+    /* Allocate the highest level of cells. */
+    nr_cells = cdim[0] * cdim[1] * cdim[2];
+    if ( posix_memalign( (void *)&cells , 64 , nr_cells * sizeof(struct cell) ) != 0 )
+        error( "Failed to allocate cells." );
+    bzero( cells , nr_cells * sizeof(struct cell) );
+    for ( k = 0 ; k < nr_cells ; k++ )
+        if ( lock_init( &cells[k].lock ) != 0 )
+            error( "Failed to init spinlock." );
+        
+    /* Set the cell locations. */
+    for ( i = 0 ; i < cdim[0] ; i++ )
+        for ( j = 0 ; j < cdim[1] ; j++ )
+            for ( k = 0 ; k < cdim[2] ; k++ ) {
+                c = &cells[ cell_getid( cdim , i , j , k ) ];
+                c->loc[0] = i*h[0]; c->loc[1] = j*h[1]; c->loc[2] = k*h[2];
+                c->h[0] = h[0]; c->h[1] = h[1]; c->h[2] = h[2];
+                }
+        
+    /* Run through the particles and get the counts for each cell. */
+    for ( k = 0 ; k < N ; k++ )
+        cells[ cell_getid( cdim , parts[k].x[0]*ih[0] , parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) ].count += 1;
+        
+    /* Allocate the new part buffer and set the part pointers in each cell. */
+    if ( posix_memalign( (void *)&parts_new , 64 , N * sizeof(struct part) ) != 0 )
+        error( "Failed to allocate parts." );
+    for ( finger = parts_new , k = 0 ; k < nr_cells ; k++ ) {
+        c = &cells[ k ];
+        c->parts = finger;
+        finger = &finger[ c->count ];
+        c->count = 0;
+        }
+    for ( k = 0 ; k < N ; k++ ) {
+        c = &cells[ cell_getid( cdim , parts[k].x[0]*ih[0] , parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) ];
+        c->parts[ c->count ] = parts[k];
+        c->count += 1;
+        }
+        
+    /* Loop over the cells and split them. */
+    /* for ( k = 0 ; k < nr_cells ; k++ )
+        if ( cells[k].count > space_maxppc )
+            space_split( s , &cells[k] ); */
+        
+    /* Store eveything in the space. */
+    s->r_min = r_min; s->r_max = r_max;
+    s->dim[0] = dim[0]; s->dim[1] = dim[1]; s->dim[2] = dim[2];
+    s->periodic = periodic;
+    s->parts = parts_new;
+    s->nr_parts = N;
+    s->h[0] = h[0]; s->h[1] = h[1]; s->h[2] = h[2];
+    s->ih[0] = ih[0]; s->ih[1] = ih[1]; s->ih[2] = ih[2];
+    s->cdim[0] = cdim[0]; s->cdim[1] = cdim[1]; s->cdim[2] = cdim[2];
+    s->cells = cells;
+    s->nr_cells = nr_cells;
+    s->tot_cells = nr_cells;
+    if ( lock_init( &s->task_lock ) != 0 )
+        error( "Failed to create task spin-lock." );
+    
+    /* Loop over the cells and split them. */
+    for ( k = 0 ; k < nr_cells ; k++ )
+        space_split( s , &cells[k] );
+            
+    }
+
diff --git a/space.h b/space.h
new file mode 100644
index 0000000000000000000000000000000000000000..a75e5e7ebe1ffa9c65c48d121abe5f6ec5dc9df3
--- /dev/null
+++ b/space.h
@@ -0,0 +1,177 @@
+
+
+/* Some constants. */
+#define space_maxdepth                  10
+#define space_cellallocchunk            1000
+#define space_splitratio                0.875
+#define space_splitsize                 800
+#define task_maxwait                    3
+#define task_maxunlock                  39
+
+
+/* Map shift vector to sortlist. */
+extern const int sortlistID[27];
+    
+    
+/* The different task IDs. */
+enum taskIDs {
+    tid_none = 0,
+    tid_sort,
+    tid_self,
+    tid_pair,
+    tid_sub,
+    tid_count
+    };
+extern const char *taskID_names[];
+    
+    
+/* Data of a task. */
+struct task {
+
+    int type, flags, wait, rank, done;
+    
+    int nr_unlock_tasks;
+    struct task *unlock_tasks[ task_maxunlock ];
+
+    int nr_unlock_cells;
+    struct cell *ci, *cj, *unlock_cells[2];
+    
+    } __attribute__((aligned (64)));
+
+
+/* Entry in a list of sorted indices. */
+struct entry {
+    float d;
+    int i;
+    };
+    
+    
+/* Data of a single particle. */
+struct part {
+
+    /* Particle position. */
+    double x[3];
+    
+    /* Particle cutoff radius. */
+    float r;
+    
+    /* Particle time-step. */
+    float dt;
+    
+    /* Particle ID. */
+    int id;
+    
+    /* Number of pairwise interactions. */
+    float count;
+    int icount;
+    
+    } __attribute__((aligned (32)));
+    
+
+/* Structure to store the data of a single cell. */
+struct cell {
+
+    /* The cell location on the grid. */
+    double loc[3];
+    
+    /* The cell dimensions. */
+    double h[3];
+    
+    /* Max radii in this cell. */
+    double r_max;
+    
+    /* The depth of this cell in the tree. */
+    int depth, split;
+    
+    /* Nr of parts. */
+    int count;
+    
+    /* Pointers to the particle data. */
+    struct part *parts;
+    
+    /* Pointers for the sorted indices. */
+    struct entry *sort;
+    
+    /* Number of pairs associated with this cell. */
+    int nr_pairs;
+    
+    /* Pointers to the next level of cells. */
+    struct cell *progeny[8];
+    
+    /* Parent cell. */
+    struct cell *parent;
+    
+    /* The tasks computing this cell's sorts. */
+    struct task *sorts[14];
+    
+    /* Number of tasks this cell is waiting for and whether it is in use. */
+    int wait;
+    
+    /* Is the data of this cell being used in a sub-cell? */
+    int hold;
+    
+    /* Spin lock for various uses. */
+    lock_type lock;
+    
+    /* Linking pointer for "memory management". */
+    struct cell *next;
+
+    } __attribute__((aligned (64)));
+
+
+/* The space in which the cells reside. */
+struct space {
+
+    /* Spatial extent. */
+    double dim[3];
+    
+    /* Cell widths. */
+    double h[3], ih[3];
+    
+    /* The minimum and maximum cutoff radii. */
+    double r_min, r_max;
+    
+    /* Number of cells. */
+    int nr_cells, tot_cells;
+    
+    /* Space dimensions in number of cells. */
+    int maxdepth, cdim[3];
+    
+    /* The (level 0) cells themselves. */
+    struct cell *cells;
+    
+    /* Buffer of unused cells. */
+    struct cell *cells_new;
+    
+    /* The particle data (cells have pointers to this). */
+    struct part *parts;
+    
+    /* The sortlist data (cells hold pointers to these). */
+    struct entry *sortlist;
+    
+    /* The total number of parts in the space. */
+    int nr_parts;
+    
+    /* Is the space periodic? */
+    int periodic;
+    
+    /* The list of tasks. */
+    struct task *tasks;
+    int nr_tasks, next_task;
+    int *tasks_ind;
+    lock_type task_lock;
+    
+    };
+
+
+/* function prototypes. */
+struct cell *space_getcell ( struct space *s );
+struct task *space_gettask ( struct space *s );
+void space_init ( struct space *s , double dim[3] , struct part *parts , int N , int periodic , double h_max );
+void space_maketasks ( struct space *s , int do_sort );
+void space_map_cells ( struct space *s , void (*fun)( struct cell *c , void *data ) , void *data );
+void space_map_parts ( struct space *s , void (*fun)( struct part *p , struct cell *c , void *data ) , void *data );
+void space_recycle ( struct space *s , struct cell *c );
+
+
+
diff --git a/test.c b/test.c
new file mode 100644
index 0000000000000000000000000000000000000000..f22ed69dae37300c64745eaf31645e2cce2e7dbc
--- /dev/null
+++ b/test.c
@@ -0,0 +1,545 @@
+
+/* Some standard headers. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <pthread.h>
+#include <math.h>
+#include <omp.h>
+
+/* Local headers. */
+#define INLINE
+#include "cycle.h"
+#include "lock.h"
+#include "space.h"
+#include "runner.h"
+
+/* Ticks per second on this machine. */
+#ifndef CPU_TPS
+    #define CPU_TPS 2.67e9
+#endif
+
+/* Error macro. */
+#define error(s) { printf( "%s:%s:%i: %s\n" , __FILE__ , __FUNCTION__ , __LINE__ , s ); abort(); }
+
+
+/**
+ * @brief Mapping function to draw a specific cell (gnuplot).
+ */
+
+void map_cells_plot ( struct cell *c , void *data ) {
+
+    int k, depth = *(int *)data;
+    double *l = c->loc, *h = c->h;
+
+    if ( c->depth >= depth ) {
+    
+        printf( "%.16e %.16e %.16e\n" , l[0] , l[1] , l[2] );
+        printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1] , l[2] );
+        printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1]+h[1] , l[2] );
+        printf( "%.16e %.16e %.16e\n\n\n" , l[0] , l[1]+h[1] , l[2] );
+    
+        printf( "%.16e %.16e %.16e\n" , l[0] , l[1] , l[2]+h[2] );
+        printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1] , l[2]+h[2] );
+        printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1]+h[1] , l[2]+h[2] );
+        printf( "%.16e %.16e %.16e\n\n\n" , l[0] , l[1]+h[1] , l[2]+h[2] );
+    
+        printf( "%.16e %.16e %.16e\n" , l[0] , l[1] , l[2] );
+        printf( "%.16e %.16e %.16e\n" , l[0] , l[1]+h[1] , l[2] );
+        printf( "%.16e %.16e %.16e\n" , l[0] , l[1]+h[1] , l[2]+h[2] );
+        printf( "%.16e %.16e %.16e\n\n\n" , l[0] , l[1] , l[2]+h[2] );
+    
+        printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1] , l[2] );
+        printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1]+h[1] , l[2] );
+        printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1]+h[1] , l[2]+h[2] );
+        printf( "%.16e %.16e %.16e\n\n\n" , l[0]+h[0] , l[1] , l[2] +h[2]);
+    
+        printf( "%.16e %.16e %.16e\n" , l[0] , l[1] , l[2] );
+        printf( "%.16e %.16e %.16e\n" , l[0] , l[1] , l[2]+h[2] );
+        printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1] , l[2]+h[2] );
+        printf( "%.16e %.16e %.16e\n\n\n" , l[0]+h[0] , l[1] , l[2] );
+    
+        printf( "%.16e %.16e %.16e\n" , l[0] , l[1]+h[1] , l[2] );
+        printf( "%.16e %.16e %.16e\n" , l[0] , l[1]+h[1] , l[2]+h[2] );
+        printf( "%.16e %.16e %.16e\n" , l[0]+h[0] , l[1]+h[1] , l[2]+h[2] );
+        printf( "%.16e %.16e %.16e\n\n\n" , l[0]+h[0] , l[1]+h[1] , l[2] );
+        
+        for ( k = 0 ; k < c->count ; k++ )
+            printf( "%.16e %.16e %.16e %.16e %.16e %.16e\n" , l[0]+h[0] , l[1]+h[1] , l[2] ,
+                c->parts[k].x[0] , c->parts[k].x[1] , c->parts[k].x[2] );
+        printf( "\n\n" );
+    
+        }
+
+    }
+
+
+/**
+ * @brief Mapping function for checking if each part is in its box.
+ */
+
+/* void map_check ( struct part *p , struct cell *c , void *data ) {
+
+    if ( p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] ||
+         p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] ||
+         p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] )
+        printf( "map_check: particle %i is outside of its box.\n" , p->id );
+
+    } */
+
+
+/**
+ * @brief Mapping function for neighbour count.
+ */
+
+void map_count ( struct part *p , struct cell *c , void *data ) {
+
+    double *count = (double *)data;
+    
+    // printf( "%e\n" , p->count );
+
+    *count += p->count;
+
+    }
+
+
+/**
+ * @brief Mapping function for neighbour count.
+ */
+
+void map_icount ( struct part *p , struct cell *c , void *data ) {
+
+    int *count = (int *)data;
+    
+    // printf( "%i\n" , p->icount );
+
+    *count += p->icount;
+
+    }
+
+
+/**
+ * @brief Mapping function to print the particle position.
+ */
+
+void map_dump ( struct part *p , struct cell *c , void *data ) {
+
+    double *shift = (double *)data;
+
+    printf( "%g\t%g\t%g\n" , p->x[0]-shift[0] , p->x[1]-shift[1] , p->x[2]-shift[2] );
+
+    }
+
+
+/**
+ * @brief Read coordinates from a text file.
+ *
+ * @param fname The name of the coordinate file.
+ * @param parts An array of #part in which to store the coordinates.
+ * @param N The number of parts to read.
+ */
+ 
+void read_coords ( char *fname , struct part *parts , int N ) {
+
+    FILE *fd;
+    int k;
+    
+    /* Open the given file. */
+    if ( ( fd = fopen( fname , "r" ) ) == NULL )
+        error( "Failed to open coordinate file" );
+        
+    /* Read the coordinates into the part positions. */
+    for ( k = 0 ; k < N ; k++ ) {
+        if ( fscanf( fd , "%lf %lf %lf" , &parts[k].x[0] , &parts[k].x[1] , &parts[k].x[2] ) != 3 ) {
+            printf( "read_coords: failed to read %ith entry.\n" , k );
+            error( "Error reading coordinate file." );
+            }
+        }
+
+    }
+
+
+/**
+ * @brief Read cutoffs from a text file.
+ *
+ * @param fname The name of the cutoffs file.
+ * @param parts An array of #part in which to store the cutoffs.
+ * @param N The number of parts to read.
+ */
+ 
+void read_cutoffs ( char *fname , struct part *parts , int N ) {
+
+    FILE *fd;
+    int k;
+    
+    /* Open the given file. */
+    if ( ( fd = fopen( fname , "r" ) ) == NULL )
+        error( "Failed to open cutoff file" );
+        
+    /* Read the coordinates into the part positions. */
+    for ( k = 0 ; k < N ; k++ ) {
+        if ( fscanf( fd , "%ef" , &parts[k].r ) != 1 ) {
+            printf( "read_cutoffs: failed to read %ith entry.\n" , k );
+            error( "Error reading cutoff file." );
+            }
+        }
+
+    }
+    
+    
+/**
+ * @brief Read id from a text file.
+ *
+ * @param fname The name of the id file.
+ * @param parts An array of #part in which to store the dt.
+ * @param N The number of parts to read.
+ */
+ 
+void read_id ( char *fname , struct part *parts , int N ) {
+
+    FILE *fd;
+    int k;
+    
+    /* Open the given file. */
+    if ( ( fd = fopen( fname , "r" ) ) == NULL )
+        error( "Failed to open id file" );
+        
+    /* Read the coordinates into the part positions. */
+    for ( k = 0 ; k < N ; k++ ) {
+        if ( fscanf( fd , "%i" , &parts[k].id ) != 1 ) {
+            printf( "read_id: failed to read %ith entry.\n" , k );
+            error( "Error reading id file." );
+            }
+        }
+
+    }
+    
+    
+/**
+ * @brief Read dt from a text file.
+ *
+ * @param fname The name of the dt file.
+ * @param parts An array of #part in which to store the dt.
+ * @param N The number of parts to read.
+ */
+ 
+void read_dt ( char *fname , struct part *parts , int N ) {
+
+    FILE *fd;
+    int k;
+    
+    /* Open the given file. */
+    if ( ( fd = fopen( fname , "r" ) ) == NULL )
+        error( "Failed to open dt file" );
+        
+    /* Read the coordinates into the part positions. */
+    for ( k = 0 ; k < N ; k++ ) {
+        if ( fscanf( fd , "%ef" , &parts[k].dt ) != 1 )
+            error( "Error reading dt file." );
+        }
+
+    }
+    
+    
+/**
+ * @brief Compute the average number of pairs per particle using
+ *      a brute-force O(N^2) computation.
+ *
+ * @param dim The space dimensions.
+ * @param parts The #part array.
+ * @param N The number of parts.
+ * @param periodic Periodic boundary conditions flag.
+ */
+
+void pairs_n2 ( double *dim , struct part *__restrict__ parts , int N , int periodic ) {
+
+    int i, j, k, count = 0, mj, mk;
+    double r, dx[3], dcount = 0.0, maxratio = 1.0;
+    float f1, f2;
+    
+    /* Loop over all particle pairs. */
+    #pragma omp parallel for schedule(dynamic), default(none), private(k,i,dx,r,f1,f2), shared(maxratio,mj,mk,periodic,parts,dim,N,stdout), reduction(+:count,dcount)
+    for ( j = 0 ; j < N ; j++ ) {
+        if ( j % 1000 == 0 ) {
+            printf( "pairs_n2: j=%i, count=%i.\n" , j , count );
+            fflush(stdout);
+            }
+        for ( k = j+1 ; k < N ; k++ ) {
+            for ( i = 0 ; i < 3 ; i++ ) {
+                dx[i] = parts[j].x[i] - parts[k].x[i];
+                if ( periodic ) {
+                    if ( dx[i] < -dim[i]/2 )
+                        dx[i] += dim[i];
+                    else if ( dx[i] > dim[i]/2 )
+                        dx[i] -= dim[i];
+                    }
+                }
+            r = sqrt( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] );
+            if ( r < parts[j].r || r < parts[k].r ) {
+                f1 = 0.0; f2 = 0.0;
+                iact( r*r , parts[j].r , parts[k].r , &f1 , &f2 , &count , &count );
+                dcount += f1 + f2;
+                if ( parts[j].r / parts[k].r > maxratio )
+                    #pragma omp critical
+                    {
+                    maxratio = parts[j].r / parts[k].r;
+                    mj = j; mk = k;
+                    }
+                else if ( parts[k].r / parts[j].r > maxratio )
+                    #pragma omp critical
+                    {
+                    maxratio = parts[k].r / parts[j].r;
+                    mj = j; mk = k;
+                    }
+                }
+            }
+        }
+            
+    /* Dump the result. */
+    printf( "pairs_n2: avg. nr. of pairs per part is %.3f (%.3f).\n" , ((double)count)/N , dcount/N + 32.0/3 );
+    printf( "pairs_n2: maximum ratio between parts %i [%e,%e,%e] and %i [%e,%e,%e] is %.3f/%.3f\n" ,
+        mj , parts[mj].x[0] , parts[mj].x[1] , parts[mj].x[2] ,
+        mk , parts[mk].x[0] , parts[mk].x[1] , parts[mk].x[2] ,
+        parts[mj].r , parts[mk].r ); fflush(stdout);
+    fflush(stdout);
+            
+    }
+
+
+/**
+ * @brief Find the pairs of a single particle
+ *
+ * @param dim The space dimensions.
+ * @param parts The #part array.
+ * @param N The number of parts.
+ * @param periodic Periodic boundary conditions flag.
+ * @param target the index of the target particle.
+ */
+
+void pairs_single ( double *dim , struct part *__restrict__ parts , int N , int periodic , int target ) {
+
+    int i, k, tid;
+    double r, tx[3], tr, dx[3];
+    
+    /* Get the target position and radius. */
+    for ( k = 0 ; k < 3 ; k++ )
+        tx[k] = parts[target].x[k];
+    tr = parts[target].r;
+    tid = parts[target].id;
+    
+    /* Loop over all particle pairs. */
+    #pragma omp parallel for schedule(dynamic), default(none), private(k,i,dx,r), shared(target,tx,tr,tid,periodic,parts,dim,N)
+    for ( k = 0 ; k < N ; k++ ) {
+        if ( k == target )
+            continue;
+        for ( i = 0 ; i < 3 ; i++ ) {
+            dx[i] = tx[i] - parts[k].x[i];
+            if ( periodic ) {
+                if ( dx[i] < -dim[i]/2 )
+                    dx[i] += dim[i];
+                else if ( dx[i] > dim[i]/2 )
+                    dx[i] -= dim[i];
+                }
+            }
+        r = sqrt( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] );
+        if ( r < tr )
+            printf( "pairs_single: %i %i [%e,%e,%e] %e\n" ,
+                tid , parts[k].id , dx[0] , dx[1] , dx[2] , r );
+        }
+            
+    }
+
+
+/**
+ * @brief Main routine that loads a few particles and generates some output.
+ *
+ */
+ 
+int main ( int argc , char *argv[] ) {
+
+    int c, icount, j, k, N = 100, periodic = 1, nr_threads = 1, nr_queues = -1, runs = 1;
+    double dim[3] = { 1.0 , 1.0 , 1.0 }, shift[3] = { 0.0 , 0.0 , 0.0 };
+    double r_min = 0.01, r_max = 0.1, h_max = -1.0 , scaling = 1.0, count = 0.0;
+    struct part *parts = NULL;
+    struct space s;
+    struct runner r;
+    ticks tic;
+    
+    /* Init the space. */
+    bzero( &s , sizeof(struct space) );
+    
+    /* Parse the options. */
+    while ( ( c = getopt( argc , argv  , "a:b:p:d:N:c:h:v:m:s:t:q:r:i:m:" ) ) != -1 )
+        switch ( c ) {
+            case 'N':
+                if ( sscanf( optarg , "%d" , &N ) != 1 )
+                    error( "Error parsing number of particles." );
+                if ( posix_memalign( (void *)&parts , 16 , N * sizeof(struct part) ) != 0 )
+                    error( "Call to posix_memalign failed." );
+                for ( k = 0 ; k < N ; k++ ) {
+                    parts[k].x[0] = ((double)rand()) / RAND_MAX * dim[0];
+                    parts[k].x[1] = ((double)rand()) / RAND_MAX * dim[1];
+                    parts[k].x[2] = ((double)rand()) / RAND_MAX * dim[2];
+                    parts[k].r = r_min + ((r_max - r_min)*rand())/RAND_MAX;
+                    }
+                printf( "main: allocated memory for %i parts.\n" , N ); fflush(stdout);
+                break;
+            case 'a':
+                if ( sscanf( optarg , "%lf" , &scaling ) != 1 )
+                    error( "Error parsing cutoff scaling." );
+                printf( "main: scaling cutoff by %.3f.\n" , scaling ); fflush(stdout);
+                for ( k = 0 ; k < N ; k++ )
+                    parts[k].r *= scaling;
+                break;
+            case 'b':
+                if ( sscanf( optarg , "%lf %lf %lf" , &dim[0] , &dim[1] , &dim[2] ) != 3 )
+                    error( "Error parsing box dimensions." );
+                break;
+            case 'c':
+                printf( "main: reading parts from %s...\n" , optarg ); fflush(stdout);
+                if ( parts == NULL && posix_memalign( (void *)&parts , 16 , N * sizeof(struct part) ) != 0 )
+                    error( "Call to calloc failed." );
+                read_coords( optarg , parts , N );
+                break;
+            case 'd':
+                printf( "main: reading dt from %s...\n" , optarg ); fflush(stdout);
+                read_dt( optarg , parts , N );
+                break;
+            case 'h':
+                printf( "main: reading cutoffs from %s...\n" , optarg ); fflush(stdout);
+                read_cutoffs( optarg , parts , N );
+                break;
+            case 'i':
+                printf( "main: reading ids from %s...\n" , optarg ); fflush(stdout);
+                read_id( optarg , parts , N );
+                break;
+            case 'm':
+                if ( sscanf( optarg , "%lf" , &h_max ) != 1 )
+                    error( "Error parsing h_max." );
+                printf( "main: maximum h set to %e.\n" , h_max );
+                break;
+            case 'p':
+                if ( sscanf( optarg , "%d" , &periodic ) != 1 )
+                    error( "Error parsing periodicity." );
+                printf( "main: periodicity switched %s.\n" , periodic ? "on" : "off" );
+                break;
+            case 'q':
+                if ( sscanf( optarg , "%d" , &nr_queues ) != 1 )
+                    error( "Error parsing number of queues." );
+                break;
+            case 'r':
+                if ( sscanf( optarg , "%d" , &runs ) != 1 )
+                    error( "Error parsing number of runs." );
+                break;
+            case 's':
+                if ( sscanf( optarg , "%lf %lf %lf" , &shift[0] , &shift[1] , &shift[2] ) != 3 )
+                    error( "Error parsing shift." );
+                for ( k = 0 ; k < N ; k++ ) {
+                    parts[k].x[0] += shift[0];
+                    parts[k].x[1] += shift[1];
+                    parts[k].x[2] += shift[2];
+                    }
+                printf( "main: shifted parts by [ %.3f %.3f %.3f ].\n" , shift[0] , shift[1] , shift[2] );
+                break;
+            case 't':
+                if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
+                    error( "Error parsing number of threads." );
+                omp_set_num_threads( nr_threads );
+                break;
+            case '?':
+                error( "Unknown option." );
+                break;
+            }
+    
+    /* Get the brute-force number of pairs. */
+    // pairs_n2( dim , parts , N , periodic );
+    // pairs_single( dim , parts , N , periodic , 63628 );
+    fflush( stdout );
+    
+    /* Set default number of queues. */
+    if ( nr_queues < 0 )
+        nr_queues = nr_threads;
+            
+    /* Initialize the space with this data. */
+    tic = getticks();
+    space_init( &s , dim , parts , N , periodic , h_max );
+    printf( "main: space_init took %.3f ms.\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); fflush(stdout);
+    
+    /* Say a few nice things about the space we just created. */
+    printf( "main: space dimensions are [ %.3f %.3f %.3f ].\n" , s.dim[0] , s.dim[1] , s.dim[2] );
+    printf( "main: space %s periodic.\n" , s.periodic ? "is" : "isn't" );
+    printf( "main: highest-level cell dimensions are [ %i %i %i ].\n" , s.cdim[0] , s.cdim[1] , s.cdim[2] );
+    printf( "main: %i parts in %i cells.\n" , s.nr_parts , s.tot_cells );
+    printf( "main: maximum depth is %d.\n" , s.maxdepth );
+    printf( "main: cutoffs in [ %.3f %.3f ].\n" , s.r_min , s.r_max ); fflush(stdout);
+    
+    /* Dump the particle positions. */
+    // space_map_parts( &s , &map_dump , shift );
+    
+    /* Generate the tasks. */
+    tic = getticks();
+    space_maketasks( &s , 1 );
+    printf( "main: generated %i tasks.\n" , s.nr_tasks );
+    printf( "main: space_maketasks took %.3f ms.\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); fflush(stdout);
+    
+    /* Initialize the runner with this space. */
+    tic = getticks();
+    runner_init( &r , &s , nr_threads , nr_queues , runner_policy_steal | runner_policy_keep );
+    printf( "main: runner_init took %.3f ms.\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); fflush(stdout);
+    
+    /* Let loose a runner on the space. */
+    for ( j = 0 ; j < runs ; j++ ) {
+        printf( "main: starting run %i/%i with %i threads and %i queues...\n" , j+1 , runs , r.nr_threads , r.nr_queues ); fflush(stdout);
+        tic = getticks();
+        #ifdef TIMER
+            for ( k = 0 ; k < runner_timer_count ; k++ )
+                runner_timer[k] = 0;
+        #endif
+        #ifdef COUNTER
+            for ( k = 0 ; k < runner_counter_count ; k++ )
+                runner_counter[k] = 0;
+        #endif
+        runner_run( &r , 0 );
+        #ifdef TIMER
+            printf( "main: runner timers are [ %.3f" , runner_timer[0]/CPU_TPS*1000 );
+            for ( k = 1 ; k < runner_timer_count ; k++ )
+                printf( " %.3f" , ((double)runner_timer[k])/CPU_TPS*1000 );
+            printf( " %.3f ] ms.\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 );
+        #else
+            printf( "main: runner_run with %i threads took %.3f ms.\n" , nr_threads , ((double)(getticks() - tic)) / CPU_TPS * 1000 );
+        #endif
+        #ifdef COUNTER
+            printf( "main: runner counters are [ %d" , runner_counter[0] );
+            for ( k = 1 ; k < runner_counter_count ; k++ )
+                printf( " %d" , runner_counter[k] );
+            printf( " ].\n" );
+        #endif
+        printf( "main: runner queue lengths are [ %i" , r.queues[0].count );
+        for ( k = 1 ; k < r.nr_queues ; k++ )
+            printf( " %i" , r.queues[k].count );
+        printf( " ].\n" );
+        fflush(stdout);
+        }
+        
+    /* Get the average interactions per particle. */
+    count = 0;
+    space_map_parts( &s , &map_count , &count );
+    printf( "main: average interactions per particle is %.3f.\n" , count / s.nr_parts + 32.0/3 );
+    
+    /* Get the average interactions per particle. */
+    icount = 0;
+    space_map_parts( &s , &map_icount , &icount );
+    printf( "main: average neighbours per particle is %.3f.\n" , (double)icount / s.nr_parts );
+    
+    /* Get all the cells of a certain depth. */
+    /* count = 11;
+    space_map_cells( &s , &map_cells_plot , &count ); */
+    
+    /* Check for outliers. */
+    // space_map_parts( &s , &map_check , NULL );
+    
+    /* All is calm, all is bright. */
+    return 0;
+    
+    }
diff --git a/test_mc.sh b/test_mc.sh
new file mode 100755
index 0000000000000000000000000000000000000000..11da8c39de578738b59ce152ce1102d5227e3cb8
--- /dev/null
+++ b/test_mc.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Set the default OpenMP behaviour
+export OMP_WAIT_POLICY=PASSIVE
+
+# Clear up the memory first
+# ./memhog `free -g | grep Mem | awk '{print int(0.9*$2)}'`
+
+# loop over number of CPUs
+for cpu in {1..32}
+do
+
+    # Set some environment variables for OpenMP
+    export OMP_NUM_THREADS=$cpu
+    export OMP_THREAD_LIMIT=$cpu
+    export OMP_PROC_BIND=TRUE
+    
+    # ./test -r 100 -t $cpu -q $cpu -b "25000 25000 25000" -N 149646 -c data/Coordinates.txt -s "12500 12500 12500" -h data/SmoothingLength.txt > test_${cpu}.dump
+
+    ./test -r 100 -t $cpu -q $cpu -b "100 100 100" -N 3664514 -c data2/Coordinates.txt -h data2/SmoothingLength.txt > test2_${cpu}.dump
+
+    # ./test -r 100 -t $cpu -q $cpu -b "140 140 140" -N 7741 -c shrink/Coordinates.txt -s "70 70 70" -h shrink/SmoothingLength.txt > shrink_${cpu}.dump
+
+done