diff --git a/examples/Makefile.am b/examples/Makefile.am
index 9708f27ed737f34bc9bb425d4b6bb0529b8feb8a..11013c9414642beb4672e8cec435854ef39507ec 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -24,6 +24,9 @@ AM_CFLAGS = -g -O3 -Wall -Werror -I../src -ffast-math -fstrict-aliasing \
     -ftree-vectorize -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
     -DCPU_TPS=2.67e9 -DTIMERS \
     # -fsanitize=address -fno-omit-frame-pointer
+# AM_CFLAGS = -g -O0 -Wall -Werror -I../src \
+#     -DCPU_TPS=2.67e9 -DTIMERS $(OPENMP_CFLAGS) \
+#     -fsanitize=address -fno-omit-frame-pointer
 
 AM_LDFLAGS = -lm # -fsanitize=address
 
diff --git a/examples/mkplots_tasks.m b/examples/mkplots_tasks.m
index 9812bfa5a82adc111866e0065e0c9f868524e175..ba585eb8d4c61435a652a28e190b895f3fd914b7 100644
--- a/examples/mkplots_tasks.m
+++ b/examples/mkplots_tasks.m
@@ -72,9 +72,9 @@ end
 hold off;
 xlabel('time (ms)');
 ylabel('core ID');
-set(gca,'YTick',0:(max(tasks(:,2))))
+set(gca,'YTick',0:8:(max(tasks(:,2))))
 title('QuickSched tiled QR decomposition tasks');
-axis([ 0 , 110 , -0.5 , nr_cores-0.5 ]);
+axis([ 0 , 250 , -0.5 , nr_cores-0.5 ]);
 
 % Print this plot
 set( gcf , 'PaperSize' , 2.3*[ 16 4 ] );
@@ -108,9 +108,9 @@ end
 hold off;
 xlabel('time (ms)');
 ylabel('core ID');
-set(gca,'YTick',0:(max(tasks(:,1))))
+set(gca,'YTick',0:8:(max(tasks(:,1))))
 title('OmpSs tiled QR decomposition tasks');
-axis([ 0 , 110 , -0.5 , nr_cores-0.5 ]);
+axis([ 0 , 250 , -0.5 , nr_cores-0.5 ]);
 
 % Print this plot
 set( gcf , 'PaperSize' , 2.3*[ 16 4 ] );
@@ -123,7 +123,7 @@ print -depsc2 figures/tasks_qr_ompss.eps
 
 %% Plot the task timelines for tasks allocation
 % Load the data
-tasks = importdata( 'test.dump' );
+tasks = dlmread( 'test_bh_sorted.tasks' );
 tasks(:,4) = ( tasks(:,4) - tasks(:,3) ) * tpms;
 start = min( tasks(:,3) );
 tasks(:,3) = ( tasks(:,3) - start ) * tpms;
@@ -132,7 +132,11 @@ nr_cores = max( tasks(:,2) ) + 1;
 % Init the plot
 clf;
 subplot('position',[ 0.05 , 0.1 , 0.9 , 0.8 ]);
-colours = [ 1 0 0 ; 0 1 0 ; 0 0 1 ; 1 1 0 ];
+colours = [ 1 0 0 ; 0 1 0 ; 0 0 1 ; 1 1 0 ; 0 1 1 ];
+xlabel('time (ms)');
+ylabel('core ID');
+title('Barnes-Hut tasks');
+axis([ 0 , max( tasks(:,3) + tasks(:,4) ) , -0.5 , nr_cores-0.5 ]);
 hold on;
 
 % Plot the tasks
@@ -144,11 +148,6 @@ end
 
 % Set the axes and stuff.
 hold off;
-xlabel('time (ms)');
-ylabel('core ID');
-set(gca,'YTick',0:(max(tasks(:,1))))
-title('Barnes-Hut tasks');
-axis([ 0 , max( tasks(:,3) + tasks(:,4) ) , -0.5 , nr_cores-0.5 ]);
 
 % Print this plot
 set( gcf , 'PaperSize' , 2.3*[ 16 4 ] );
diff --git a/examples/test_bh.c b/examples/test_bh.c
deleted file mode 100644
index 7c563a74a38046038e6ba5d5b865642d935ab8a2..0000000000000000000000000000000000000000
--- a/examples/test_bh.c
+++ /dev/null
@@ -1,847 +0,0 @@
-/*******************************************************************************
- * This file is part of QuickSched.
- * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * 
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
- ******************************************************************************/
-
-
-/* Config parameters. */
-#include "../config.h"
-
-/* Standard includes. */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <math.h>
-#include <omp.h>
-
-/* Local includes. */
-#include "quicksched.h"
-
-
-/* Some local constants. */
-#define cell_pool_grow 100
-#define cell_maxparts 1
-#define task_limit 5000
-#define const_G 6.6738e-11
-#define dist_min 0.5
-
-
-/** Data structure for the particles. */
-struct part {
-    double x[3];
-    double a[3];
-    double mass;
-    int id;
-    };
-   
-    
-/** Data structure for the BH tree cell. */
-struct cell {
-    double loc[3];
-    double h[3];
-    int split, count;
-    struct part *parts;
-    struct cell *progeny[8];
-    struct cell *parent;
-    double com[3];
-    double mass;
-    int res, com_tid, depth;
-    };
-    
-    
-/** Task types. */
-enum task_type {
-    task_type_self = 0,
-    task_type_pair,
-    task_type_pair_pc,
-    task_type_com,
-    task_type_count
-    };
-    
-/** Per-type timers. */
-ticks task_timers[ task_type_count ];
-    
-    
-/** Global variable for the pool of allocated cells. */
-struct cell *cell_pool = NULL;
-
-
-/**
- * @brief Get a #cell from the pool.
- */
- 
-struct cell *cell_get ( ) {
-
-    struct cell *res;
-    int k;
-
-    /* Allocate a new batch? */
-    if ( cell_pool == NULL ) {
-    
-        /* Allocate the cell array. */
-        if ( ( cell_pool = (struct cell *)malloc( sizeof(struct cell) * cell_pool_grow ) ) == NULL )
-            error( "Failed to allocate fresh batch of cells." );
-            
-        /* Clear the cells. */
-        bzero( cell_pool , sizeof(struct cell) * cell_pool_grow );
-        
-        /* Link them up via their progeny pointers. */
-        for ( k = 1 ; k < cell_pool_grow ; k++ )
-            cell_pool[k-1].progeny[0] = &cell_pool[k];
-    
-        }
-        
-    /* Pick a cell off the pool. */
-    res = cell_pool;
-    cell_pool = cell_pool->progeny[0];
-    
-    /* Clean up a few things. */
-    res->res = qsched_res_none;
-    
-    /* Return the cell. */
-    return res;
-
-    }
-    
-    
-/**
- * @brief Sort the parts into eight bins along the given pivots and
- *        fill the multipoles. Also adds the hierarchical resources
- *        to the sched.
- *
- * @param c The #cell to be split.
- * @param N The total number of parts.
- * @param s The #sched to store the resources.
- */
- 
-void cell_split ( struct cell *c , struct qsched *s ) {
-
-    int i, j, k, count = c->count;
-    struct part temp, *parts = c->parts;
-    struct cell *cp;
-    int left[8], right[8];
-    double pivot[3];
-    static struct cell *root = NULL;
-    
-    /* Set the root cell. */
-    if ( root == NULL ) {
-        root = c;
-        c->depth = 0;
-        }
-    
-    /* Add a resource for this cell if it doesn't have one yet. */
-    if ( c->res == qsched_res_none ) 
-        c->res = qsched_addres( s , qsched_owner_none , qsched_res_none );
-    
-    /* Attach a center-of-mass task to the cell. */
-    c->com_tid = qsched_addtask( s , task_type_com , task_flag_none , &c , sizeof(struct cell *) , 1 );
-    
-    /* Does this cell need to be split? */
-    if ( count > cell_maxparts ) {
-    
-        /* Mark this cell as split. */
-        c->split = 1;
-    
-        /* Create the progeny. */
-        for ( k = 0 ; k < 8 ; k++ ) {
-            c->progeny[k] = cp = cell_get();
-            cp->parent = c;
-            cp->depth = c->depth + 1;
-            cp->loc[0] = c->loc[0];
-            cp->loc[1] = c->loc[1];
-            cp->loc[2] = c->loc[2];
-            cp->h[0] = c->h[0]/2;
-            cp->h[1] = c->h[1]/2;
-            cp->h[2] = c->h[2]/2;
-            cp->res = qsched_addres( s , qsched_owner_none , c->res );
-            if ( k & 4 )
-                cp->loc[0] += cp->h[0];
-            if ( k & 2 )
-                cp->loc[1] += cp->h[1];
-            if ( k & 1 )
-                cp->loc[2] += cp->h[2];
-            }
-    
-        /* Init the pivots. */
-        for ( k = 0 ; k < 3 ; k++ )
-            pivot[k] = c->loc[k] + c->h[k]/2;
-
-        /* Split along the x-axis. */
-        i = 0; j = count - 1;
-        while ( i <= j ) {
-            while ( i <= count-1 && parts[i].x[0] < pivot[0] )
-                i += 1;
-            while ( j >= 0 && parts[j].x[0] >= pivot[0] )
-                j -= 1;
-            if ( i < j ) {
-                temp = parts[i]; parts[i] = parts[j]; parts[j] = temp;
-                }
-            }
-        left[1] = i; right[1] = count - 1;
-        left[0] = 0; right[0] = j;
-
-        /* Split along the y axis, twice. */
-        for ( k = 1 ; k >= 0 ; k-- ) {
-            i = left[k]; j = right[k];
-            while ( i <= j ) {
-                while ( i <= right[k] && parts[i].x[1] < pivot[1] )
-                    i += 1;
-                while ( j >= left[k] && parts[j].x[1] >= pivot[1] )
-                    j -= 1;
-                if ( i < j ) {
-                    temp = parts[i]; parts[i] = parts[j]; parts[j] = temp;
-                    }
-                }
-            left[2*k+1] = i; right[2*k+1] = right[k];
-            left[2*k] = left[k]; right[2*k] = j;
-            }
-
-        /* Split along the z axis, four times. */
-        for ( k = 3 ; k >= 0 ; k-- ) {
-            i = left[k]; j = right[k];
-            while ( i <= j ) {
-                while ( i <= right[k] && parts[i].x[2] < pivot[2] )
-                    i += 1;
-                while ( j >= left[k] && parts[j].x[2] >= pivot[2] )
-                    j -= 1;
-                if ( i < j ) {
-                    temp = parts[i]; parts[i] = parts[j]; parts[j] = temp;
-                    }
-                }
-            left[2*k+1] = i; right[2*k+1] = right[k];
-            left[2*k] = left[k]; right[2*k] = j;
-            }
-
-        /* Store the counts and offsets. */
-        for ( k = 0 ; k < 8 ; k++ ) {
-            c->progeny[k]->count = right[k] - left[k] + 1;
-            c->progeny[k]->parts = &c->parts[ left[k] ];
-            }
-            
-        /* Recurse. */
-        for ( k = 0 ; k < 8 ; k++ )
-            cell_split( c->progeny[k] , s );
-            
-        /* Link the COM tasks. */
-        for ( k = 0 ; k < 8 ; k++ )
-            qsched_addunlock( s , c->progeny[k]->com_tid , c->com_tid );
-            
-        } /* does the cell need to be split? */
-        
-    /* Set this cell's resources ownership. */
-    qsched_res_own( s , c->res , s->nr_queues * ( c->parts - root->parts ) / root->count );
-        
-    }
-    
-    
-/**
- * @brief Compute the center of mass of a given cell.
- *
- * @param c The #cell.
- */
- 
-void comp_com ( struct cell *c ) {
-
-    int k, count = c->count;
-    struct cell *cp;
-    struct part *p, *parts = c->parts;
-
-    /* Is the cell split? */
-    if ( c->split ) {
-    
-        /* Collect the centre of gravity and mass from the progeny. */
-        for ( k = 0 ; k < 8 ; k++ ) {
-            cp = c->progeny[k];
-            c->com[0] += cp->com[0]*cp->mass;
-            c->com[1] += cp->com[1]*cp->mass;
-            c->com[2] += cp->com[2]*cp->mass;
-            c->mass += cp->mass;
-            }
-        c->com[0] /= c->mass; c->com[1] /= c->mass; c->com[2] /= c->mass;
-            
-        }
-        
-    /* Otherwise, collect the multipole from local data. */
-    else {
-    
-        for ( k = 0 ; k < count ; k++ ) {
-            p = &parts[k];
-            c->com[0] += p->x[0]*p->mass;
-            c->com[1] += p->x[1]*p->mass;
-            c->com[2] += p->x[2]*p->mass;
-            c->mass += p->mass;
-            }
-        c->com[0] /= c->mass; c->com[1] /= c->mass; c->com[2] /= c->mass;
-            
-        }
-        
-    }
-    
-    
-/**
- * @brief Compute the interactions between all particles in a cell
- *        and the center of mass of another cell.
- *
- * @param ci The #cell containing the particles.
- * @param cj The #cell containing the center of mass.
- */
- 
-void iact_pair_pc ( struct cell *ci , struct cell *cj ) {
-
-    int j, k, count = ci->count;
-    double com[3], mcom, dx[3], r2, ir, w;
-    struct part *parts = ci->parts;
-    
-    /* Early abort? */
-    if ( count == 0 || cj->count == 0 )
-        return;
-        
-    /* message( "ci=[%.3e,%.3e,%.3e], cj=[%.3e,%.3e,%.3e], h=%.3e/%.3e.",
-        ci->loc[0], ci->loc[1], ci->loc[2], 
-        cj->loc[0], cj->loc[1], cj->loc[2],
-        ci->h[0], cj->h[0] ); */
-    
-    /* Sanity check. */
-    if ( cj->mass == 0.0 )
-        error( "com does not seem to have been set." );
-    
-    /* Init the com's data. */
-    for ( k = 0 ; k < 3 ; k++ )
-        com[k] = cj->com[k];
-    mcom = cj->mass;
-
-    /* Loop over every following particle. */
-    for ( j = 0 ; j < count ; j++ ) {
-
-        /* Compute the pairwise distance. */
-        for ( r2 = 0.0 , k = 0 ; k < 3 ; k++ ) {
-            dx[k] = com[k] - parts[j].x[k];
-            r2 += dx[k]*dx[k];
-            }
-
-        /* Apply the gravitational acceleration. */
-        ir = 1.0 / sqrt( r2 );
-        w = mcom * const_G * ir * ir * ir;
-        for ( k = 0 ; k < 3 ; k++ )
-            parts[j].a[k] += w * dx[k];
-
-        } /* loop over every other particle. */
-                
-    }
-    
-
-/**
- * @brief Compute the interactions between all particles in a cell.
- *
- * @param ci The #cell.
- * @param cj The other #cell.
- */
- 
-void iact_pair ( struct cell *ci , struct cell *cj ) {
-
-    int i, j, k;
-    int count_i = ci->count, count_j = cj->count;
-    double xi[3], ai[3], mi, mj, dx[3], r2, ir, w;
-    struct part *parts_i = ci->parts, *parts_j = cj->parts;
-    
-    /* Early abort? */
-    if ( count_i == 0 || count_j == 0 )
-        return;
-    
-    /* Get the minimum distance between both cells. */
-    for ( r2 = 0.0, k = 0 ; k < 3 ; k++ ) {
-        dx[k] = ci->loc[k] - cj->loc[k];
-        if ( dx[k] < 0 )
-            dx[k] = -dx[k] - ci->h[k];
-        else if ( dx[k] > 0 )
-            dx[k] -= cj->h[k];
-        r2 += dx[k]*dx[k];
-        }
-        
-    // Cells at different level?
-    if ( ci->depth > cj->depth ) {
-    
-        // If well separated, go ahead!
-        if ( r2 > dist_min*dist_min*ci->h[0]*cj->h[0] )
-            iact_pair_pc( ci , cj );
-        
-        // Otherwise, split cj as well.
-        else
-            for ( k = 0 ; k < 8 ; k++ )
-                iact_pair( ci , cj->progeny[k] );
-    
-        }
-
-    /* Sufficiently well-separated? */
-    else if ( r2 > dist_min*dist_min*ci->h[0]*cj->h[0] ) {
-
-        /* Compute the center of mass interactions. */
-        iact_pair_pc( ci , cj );
-
-        }
-
-    /* Recurse? */
-    else if ( ci->split && cj->split )
-        for ( k = 0 ; k < 8 ; k++ ) {
-            iact_pair( cj->progeny[k] , ci );
-            iact_pair( ci->progeny[k] , cj );
-            }
-            
-    /* Otherwise, do direct interactions. */
-    else if ( ci < cj ) {
-
-        /* message( "ci=[%.3e,%.3e,%.3e], cj=[%.3e,%.3e,%.3e], h=%.3e/%.3e.",
-            ci->loc[0], ci->loc[1], ci->loc[2], 
-            cj->loc[0], cj->loc[1], cj->loc[2],
-            ci->h[0], cj->h[0] ); */
-    
-        /* Loop over all particles... */
-        for ( i = 0 ; i < count_i ; i++ ) {
-
-            /* Init the ith particle's data. */
-            for ( k = 0 ; k < 3 ; k++ ) {
-                xi[k] = parts_i[i].x[k];
-                ai[k] = 0.0;
-                }
-            mi = parts_i[i].mass;
-
-            /* Loop over every following particle. */
-            for ( j = 0 ; j < count_j ; j++ ) {
-
-                /* Compute the pairwise distance. */
-                for ( r2 = 0.0 , k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = xi[k] - parts_j[j].x[k];
-                    r2 += dx[k]*dx[k];
-                    }
-
-                /* Apply the gravitational acceleration. */
-                ir = 1.0 / sqrt( r2 );
-                w = const_G * ir * ir * ir;
-                mj = parts_j[j].mass;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    parts_j[j].a[k] += w * dx[k] * mi;
-                    ai[k] -= w * dx[k] * mj;
-                    }
-
-                } /* loop over every other particle. */
-
-            /* Store the accumulated acceleration on the ith part. */
-            for ( k = 0 ; k < 3 ; k++ )
-                parts_i[i].a[k] += ai[k];
-
-            } /* loop over all particles. */
-
-        } /* otherwise, compute interactions directly. */
-
-    }
-    
-
-/**
- * @brief Compute the interactions between all particles in a cell.
- *
- * @param c The #cell.
- */
- 
-void iact_self ( struct cell *c ) {
-
-    int i, j, k, count = c->count;
-    double xi[3], ai[3], mi, mj, dx[3], r2, ir, w;
-    struct part *parts = c->parts;
-    
-    /* Early abort? */
-    if ( count == 0 )
-        return;
-    
-    /* message( "cell=[%.3e,%.3e,%.3e], h=%.3e.",
-        c->loc[0], c->loc[1], c->loc[2], c->h[0] ); */
-    
-    /* Recurse? */
-    if ( c->split )
-        for ( j = 0 ; j < 8 ; j++ ) {
-            iact_self( c->progeny[j] );
-            for ( k = j+1 ; k < 8 ; k++ )
-                iact_pair( c->progeny[j] , c->progeny[k] );
-            }
-            
-    /* Otherwise, compute interactions directly. */
-    else {
-    
-        /* Loop over all particles... */
-        for ( i = 0 ; i < count ; i++ ) {
-        
-            /* Init the ith particle's data. */
-            for ( k = 0 ; k < 3 ; k++ ) {
-                xi[k] = parts[i].x[k];
-                ai[k] = 0.0;
-                }
-            mi = parts[i].mass;
-                
-            /* Loop over every following particle. */
-            for ( j = i+1 ; j < count ; j++ ) {
-            
-                /* Compute the pairwise distance. */
-                for ( r2 = 0.0 , k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = xi[k] - parts[j].x[k];
-                    r2 += dx[k]*dx[k];
-                    }
-                    
-                /* Apply the gravitational acceleration. */
-                ir = 1.0 / sqrt( r2 );
-                w = const_G * ir * ir * ir;
-                mj = parts[j].mass;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    parts[j].a[k] += w * dx[k] * mi;
-                    ai[k] -= w * dx[k] * mj;
-                    }
-            
-                } /* loop over every other particle. */
-                
-            /* Store the accumulated acceleration on the ith part. */
-            for ( k = 0 ; k < 3 ; k++ )
-                parts[i].a[k] += ai[k];
-        
-            } /* loop over all particles. */
-    
-        } /* otherwise, compute interactions directly. */
-
-    }
-    
-
-/**
- * @brief Create the tasks for the cell pair/self.
- *
- * @param s The #sched in which to create the tasks.
- * @param ci The first #cell.
- * @param cj The second #cell.
- */
- 
-void create_tasks ( struct qsched *s , struct cell *ci , struct cell *cj ) {
-
-    int j, k;
-    qsched_task_t tid;
-    struct cell *data[2];
-    double dx, r2;
-    
-    /* If either cell is empty, stop. */
-    if ( ci->count == 0 || ( cj != NULL && cj->count == 0 ) )
-        return;
-
-    /* Single cell? */
-    if ( cj == NULL ) {
-    
-        /* Is this cell split? */
-        if ( ci->split && ci->count > task_limit ) {
-        
-            /* Recurse over the progeny. */
-            for ( j = 0 ; j < 8 ; j++ ) {
-            
-                /* Make self-task. */
-                create_tasks( s , ci->progeny[j] , NULL );
-                
-                /* Make all pair tasks. */
-                for ( k = j+1 ; k < 8 ; k++ )
-                    create_tasks( s , ci->progeny[j] , ci->progeny[k] );
-                }
-        
-            }
-            
-        /* Otherwise, add a self-interaction task. */
-        else {
-            
-            /* Set the data. */
-            data[0] = ci; data[1] = NULL;
-            
-            /* Create the task. */
-            tid = qsched_addtask( s , task_type_self , task_flag_none , data , sizeof(struct cell *) * 2 , ci->count*ci->count/2 );
-            
-            /* Add the resource. */
-            qsched_addlock( s , tid , ci->res );
-            
-            /* If this call might recurse, add a dependency on the cell's COM. */
-            if ( ci->split )
-                qsched_addunlock( s , ci->com_tid , tid );
-        
-            }
-    
-        }
-        
-    /* Otherwise, it's a pair. */
-    else {
-    
-        /* Get the minimum distance between both cells. */
-        for ( r2 = 0.0, k = 0 ; k < 3 ; k++ ) {
-            dx = ci->loc[k] - cj->loc[k];
-            if ( dx < 0 )
-                dx = -dx - ci->h[k];
-            else if ( dx > 0 )
-                dx -= cj->h[k];
-            r2 += dx*dx;
-            }
-        
-        // Cells at different level?
-        if ( cj->depth < ci->depth ) {
-
-            // If well separated, go ahead!
-            if ( r2 > dist_min*dist_min*ci->h[0]*cj->h[0] ) {
-                data[0] = ci; data[1] = cj;
-                tid = qsched_addtask( s , task_type_pair_pc , task_flag_none , data , sizeof(struct cell *) * 2 , ci->count );
-                qsched_addlock( s , tid , ci->res );
-                qsched_addunlock( s , cj->com_tid , tid );
-                }
-
-            // Otherwise, split cj as well.
-            else
-                for ( k = 0 ; k < 8 ; k++ )
-                    create_tasks( s , ci , cj->progeny[k] );
-
-            }
-
-        /* Are the cells sufficiently well separated? */
-        else if ( r2 > dist_min*dist_min*ci->h[0]*ci->h[0] ) {
-
-            /* Interact ci's parts with cj as a cell. */
-            data[0] = ci; data[1] = cj;
-            tid = qsched_addtask( s , task_type_pair_pc , task_flag_none , data , sizeof(struct cell *) * 2 , ci->count );
-            qsched_addlock( s , tid , ci->res );
-            qsched_addunlock( s , cj->com_tid , tid );
-
-            }
-            
-        /* Does this task need to be broken-down further? */
-        else if ( ci->split && cj->split &&
-                  ci->count > task_limit && cj->count > task_limit ) {
-             
-            /* Loop over all pairs between ci and cj's progeny. */
-            for ( k = 0 ; k < 8 ; k++ ) {
-                create_tasks( s , ci->progeny[k] , cj );
-                create_tasks( s , cj->progeny[k] , ci );
-                }
-        
-            }
-            
-        /* Otherwise, generate a part-part task. */
-        else if ( ci < cj ) {
-        
-            /* Set the data. */
-            data[0] = ci; data[1] = cj;
-
-            /* Create the task. */
-            tid = qsched_addtask( s , task_type_pair , task_flag_none , data , sizeof(struct cell *) * 2 , ci->count*cj->count );
-
-            /* Add the resources. */
-            qsched_addlock( s , tid , ci->res );
-            qsched_addlock( s , tid , cj->res );
-
-            /* Depend on the COMs in case this task recurses. */
-            if ( ci->split || cj->split ) {
-                qsched_addunlock( s , ci->com_tid , tid );
-                qsched_addunlock( s , cj->com_tid , tid );
-                }
-
-            }
-        
-        } /* otherwise, it's a pair. */
-
-    }
-    
-    
-/**
- * @brief Set up and run a task-based Barnes-Hutt N-body solver.
- *
- * @param N The number of random particles to use.
- * @param nr_threads Number of threads to use.
- */
- 
-void test_bh ( int N , int nr_threads , int runs ) {
-
-    int k;
-    struct cell *root;
-    struct part *parts;
-    struct qsched s;
-    ticks tic, toc_run, tot_setup = 0, tot_run = 0;
-    
-    /* Runner function. */
-    void runner ( int type , void *data ) {
-    
-        ticks tic = getticks();
-    
-        /* Decode the data. */
-        struct cell **d = (struct cell **)data;
-    
-        /* Decode and execute the task. */
-        switch ( type ) {
-            case task_type_self:
-                iact_self( d[0] );
-                break;
-            case task_type_pair:
-                iact_pair( d[0] , d[1] );
-                break;
-            case task_type_pair_pc:
-                iact_pair_pc( d[0] , d[1] );
-                break;
-            case task_type_com:
-                comp_com( d[0] );
-                break;
-            default:
-                error( "Unknown task type." );
-            }
-            
-        atomic_add( &task_timers[type] , getticks() - tic );
-        
-        }
-        
-    /* Initialize the per-task type timers. */
-    for ( k = 0 ; k < task_type_count ; k++ )
-        task_timers[k] = 0;
-    
-    /* Initialize the scheduler. */
-    qsched_init( &s , nr_threads , qsched_flag_noreown );
-    
-    /* Init and fill the particle array. */
-    if ( ( parts = (struct part *)malloc( sizeof(struct part) * N ) ) == NULL )
-        error( "Failed to allocate particle buffer." );
-    // int hN = ceil( cbrt( N ) );
-    // double h = 1.0 / hN;
-    for ( k = 0 ; k < N ; k++ ) {
-        parts[k].id = k;
-        parts[k].x[0] = ((double)rand())/RAND_MAX; // h*(k % hN);
-        parts[k].x[1] = ((double)rand())/RAND_MAX; // h*((k / hN) % hN);
-        parts[k].x[2] = ((double)rand())/RAND_MAX; // h*(k / hN / hN);
-        parts[k].mass = ((double)rand())/RAND_MAX; // 1.0;
-        parts[k].a[0] = 0.0;
-        parts[k].a[1] = 0.0;
-        parts[k].a[2] = 0.0;
-        }
-        
-    /* Init the cells. */
-    root = cell_get();
-    root->loc[0] = 0.0; root->loc[1] = 0.0; root->loc[2] = 0.0;
-    root->h[0] = 1.0; root->h[1] = 1.0; root->h[2] = 1.0;
-    root->count = N;
-    root->parts = parts;
-    cell_split( root , &s );
-
-    /* Create the tasks. */
-    tic = getticks();
-    create_tasks( &s , root , NULL );
-    tot_setup += getticks() - tic;
-
-    /* Dump the number of tasks. */
-    message( "total nr of tasks: %i." , s.count );
-    message( "total nr of deps: %i." , s.count_deps );
-    message( "total nr of res: %i." , s.count_res );
-    message( "total nr of locks: %i." , s.count_locks );
-    message( "total nr of uses: %i." , s.count_uses );
-    int counts[ task_type_count ];
-    for ( k = 0 ; k < task_type_count ; k++ )
-        counts[k] = 0;
-    for ( k = 0 ; k < s.count ; k++ )
-        counts[ s.tasks[k].type ] += 1;
-    printf( "task counts: [ " );
-    for ( k = 0 ; k < task_type_count ; k++ )
-        printf( "%i " , counts[k] );
-    printf( "].\n" );
-        
-    /* Loop over the number of runs. */
-    for ( k = 0 ; k < runs ; k++ ) {
-    
-        /* Execute the tasks. */
-        tic = getticks();
-        qsched_run( &s , nr_threads , runner );
-	    toc_run = getticks();
-	    message( "%ith run took %lli ticks..." , k , toc_run - tic );
-        tot_run += toc_run - tic;
-        
-        }
-        
-    /* Dump the tasks. */
-    /* for ( k = 0 ; k < s.count ; k++ )
-        printf( " %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid , s.tasks[k].tic , s.tasks[k].toc ); */
-        
-    /* Dump the costs. */
-    message( "costs: setup=%lli ticks, run=%lli ticks." ,
-        tot_setup , tot_run/runs );
-        
-    /* Dump the timers. */
-    for ( k = 0 ; k < qsched_timer_count ; k++ )
-        message( "timer %s is %lli ticks." , qsched_timer_names[k] , s.timers[k]/runs );
-    
-    /* Dump the per-task type timers. */
-    printf( "task timers: [ " );
-    for ( k = 0 ; k < task_type_count ; k++ )
-        printf( "%lli " , task_timers[k]/runs );
-    printf( "] ticks.\n" );
-    
-    /* Clean up. */
-    qsched_free( &s );
-    
-    }
-    
-
-/**
- * @brief Main function.
- */
- 
-int main ( int argc , char *argv[] ) {
-
-    int c, nr_threads;
-    int N = 1000, runs = 1;
-    
-    /* Get the number of threads. */
-    #pragma omp parallel shared(nr_threads)
-    {
-        if ( omp_get_thread_num() == 0 )
-            nr_threads = omp_get_num_threads();
-    }
-    
-    /* Parse the options */
-    while ( ( c = getopt( argc , argv  , "n:r:t:" ) ) != -1 )
-        switch( c ) {
-	        case 'n':
-	            if ( sscanf( optarg , "%d" , &N ) != 1 )
-	                error( "Error parsing number of particles." );
-	            break;
-            case 'r':
-                if ( sscanf( optarg , "%d" , &runs ) != 1 )
-                    error( "Error parsing number of runs." );
-                break;
-	        case 't':
-	            if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
-	                error( "Error parsing number of threads." );
-	            omp_set_num_threads( nr_threads );
-	            break;
-	        case '?':
-                fprintf( stderr , "Usage: %s [-t nr_threads] [-n N] [-r runs]\n" , argv[0] );
-                fprintf( stderr , "Solves the N-body problem using a Barnes-Hutt\n"
-                                  "tree code with N random particles in [0,1]^3 using\n"
-                                  "nr_threads threads.\n" );
-	            exit( EXIT_FAILURE );
-	        }
-            
-    /* Dump arguments. */
-    message( "Computing the N-body problem over %i particles using %i threads (%i runs)." ,
-        N , nr_threads , runs );
-        
-    /* Run the test. */
-    test_bh( N , nr_threads, runs );
-    
-    return 0;
-    
-    }
-    
-    
diff --git a/examples/test_bh_2.c b/examples/test_bh_2.c
deleted file mode 100644
index b4c06898b9f00cde39a81c46e6b4df1a2aa1e05f..0000000000000000000000000000000000000000
--- a/examples/test_bh_2.c
+++ /dev/null
@@ -1,1252 +0,0 @@
-/*******************************************************************************
- * This file is part of QuickSched.
- * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- * 
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- * 
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- * 
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- * 
- ******************************************************************************/
-
-
-/* Config parameters. */
-#include "../config.h"
-
-/* Standard includes. */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <math.h>
-#include <omp.h>
-
-/* Local includes. */
-#include "quicksched.h"
-
-
-/* Some local constants. */
-#define cell_pool_grow 100
-#define cell_maxparts 20
-#define task_limit 5000
-#define const_G 6.6738e-11
-#define dist_min 0.5 // 0.5
-
-
-#define ICHECK -1
-
-/** Data structure for the particles. */
-struct part {
-    double x[3];
-    double a[3];
-    double a_legacy[3];
-    double a_exact[3];
-    double mass;
-    int id;
-    };
-   
-    
-/** Data structure for the BH tree cell. */
-struct cell {
-    double loc[3];
-    double h[3];
-    int split, count;
-    struct part *parts;
-    struct cell *progeny[8];
-    struct cell *parent;
-    double com[3];
-    double mass;
-    double com_legacy[3];
-    double mass_legacy;
-    int res, com_tid, depth;
-    };
-    
-    
-/** Task types. */
-enum task_type {
-    task_type_self = 0,
-    task_type_pair,
-    task_type_pair_pc,
-    task_type_com,
-    task_type_count
-    };
-    
-/** Per-type timers. */
-ticks task_timers[ task_type_count ];
-    
-    
-/** Global variable for the pool of allocated cells. */
-struct cell *cell_pool = NULL;
-
-
-/**
- * @brief Get a #cell from the pool.
- */
- 
-struct cell *cell_get ( ) {
-
-    struct cell *res;
-    int k;
-
-    /* Allocate a new batch? */
-    if ( cell_pool == NULL ) {
-    
-        /* Allocate the cell array. */
-        if ( ( cell_pool = (struct cell *)malloc( sizeof(struct cell) * cell_pool_grow ) ) == NULL )
-            error( "Failed to allocate fresh batch of cells." );
-            
-        /* Clear the cells. */
-        bzero( cell_pool , sizeof(struct cell) * cell_pool_grow );
-        
-        /* Link them up via their progeny pointers. */
-        for ( k = 1 ; k < cell_pool_grow ; k++ )
-            cell_pool[k-1].progeny[0] = &cell_pool[k];
-    
-        }
-        
-    /* Pick a cell off the pool. */
-    res = cell_pool;
-    cell_pool = cell_pool->progeny[0];
-    
-    /* Clean up a few things. */
-    res->res = qsched_res_none;
-    
-    /* Return the cell. */
-    return res;
-
-    }
-    
-    
-/**
- * @brief Sort the parts into eight bins along the given pivots and
- *        fill the multipoles. Also adds the hierarchical resources
- *        to the sched.
- *
- * @param c The #cell to be split.
- * @param N The total number of parts.
- * @param s The #sched to store the resources.
- */
- 
-void cell_split ( struct cell *c , struct qsched *s ) {
-
-    int i, j, k, count = c->count;
-    struct part temp, *parts = c->parts;
-    struct cell *cp;
-    int left[8], right[8];
-    double pivot[3];
-    static struct cell *root = NULL;
-    
-    /* Set the root cell. */
-    if ( root == NULL ) {
-        root = c;
-        c->depth = 0;
-        }
-    
-    /* Add a resource for this cell if it doesn't have one yet. */
-    if ( c->res == qsched_res_none ) 
-        c->res = qsched_addres( s , qsched_owner_none , qsched_res_none );
-    
-    /* Attach a center-of-mass task to the cell. */
-    c->com_tid = qsched_addtask( s , task_type_com , task_flag_none , &c , sizeof(struct cell *) , 1 );
-    
-    /* Does this cell need to be split? */
-    if ( count > cell_maxparts ) {
-    
-        /* Mark this cell as split. */
-        c->split = 1;
-    
-        /* Create the progeny. */
-        for ( k = 0 ; k < 8 ; k++ ) {
-            c->progeny[k] = cp = cell_get();
-            cp->parent = c;
-            cp->depth = c->depth + 1;
-            cp->loc[0] = c->loc[0];
-            cp->loc[1] = c->loc[1];
-            cp->loc[2] = c->loc[2];
-            cp->h[0] = c->h[0]/2;
-            cp->h[1] = c->h[1]/2;
-            cp->h[2] = c->h[2]/2;
-            cp->res = qsched_addres( s , qsched_owner_none , c->res );
-            if ( k & 4 )
-                cp->loc[0] += cp->h[0];
-            if ( k & 2 )
-                cp->loc[1] += cp->h[1];
-            if ( k & 1 )
-                cp->loc[2] += cp->h[2];
-            }
-    
-        /* Init the pivots. */
-        for ( k = 0 ; k < 3 ; k++ )
-            pivot[k] = c->loc[k] + c->h[k]/2;
-
-        /* Split along the x-axis. */
-        i = 0; j = count - 1;
-        while ( i <= j ) {
-            while ( i <= count-1 && parts[i].x[0] < pivot[0] )
-                i += 1;
-            while ( j >= 0 && parts[j].x[0] >= pivot[0] )
-                j -= 1;
-            if ( i < j ) {
-                temp = parts[i]; parts[i] = parts[j]; parts[j] = temp;
-                }
-            }
-        left[1] = i; right[1] = count - 1;
-        left[0] = 0; right[0] = j;
-
-        /* Split along the y axis, twice. */
-        for ( k = 1 ; k >= 0 ; k-- ) {
-            i = left[k]; j = right[k];
-            while ( i <= j ) {
-                while ( i <= right[k] && parts[i].x[1] < pivot[1] )
-                    i += 1;
-                while ( j >= left[k] && parts[j].x[1] >= pivot[1] )
-                    j -= 1;
-                if ( i < j ) {
-                    temp = parts[i]; parts[i] = parts[j]; parts[j] = temp;
-                    }
-                }
-            left[2*k+1] = i; right[2*k+1] = right[k];
-            left[2*k] = left[k]; right[2*k] = j;
-            }
-
-        /* Split along the z axis, four times. */
-        for ( k = 3 ; k >= 0 ; k-- ) {
-            i = left[k]; j = right[k];
-            while ( i <= j ) {
-                while ( i <= right[k] && parts[i].x[2] < pivot[2] )
-                    i += 1;
-                while ( j >= left[k] && parts[j].x[2] >= pivot[2] )
-                    j -= 1;
-                if ( i < j ) {
-                    temp = parts[i]; parts[i] = parts[j]; parts[j] = temp;
-                    }
-                }
-            left[2*k+1] = i; right[2*k+1] = right[k];
-            left[2*k] = left[k]; right[2*k] = j;
-            }
-
-        /* Store the counts and offsets. */
-        for ( k = 0 ; k < 8 ; k++ ) {
-            c->progeny[k]->count = right[k] - left[k] + 1;
-            c->progeny[k]->parts = &c->parts[ left[k] ];
-            }
-            
-        /* Recurse. */
-        for ( k = 0 ; k < 8 ; k++ )
-            cell_split( c->progeny[k] , s );
-            
-        /* Link the COM tasks. */
-        for ( k = 0 ; k < 8 ; k++ )
-            qsched_addunlock( s , c->progeny[k]->com_tid , c->com_tid );
-            
-        } /* does the cell need to be split? */
-        
-    /* Set this cell's resources ownership. */
-    qsched_res_own( s , c->res , s->nr_queues * ( c->parts - root->parts ) / root->count );
-        
-    }
-    
-    
-/**
- * @brief Compute the center of mass of a given cell.
- *
- * @param c The #cell.
- */
- 
-void comp_com ( struct cell *c ) {
-
-    int k, count = c->count;
-    struct cell *cp;
-    struct part *p, *parts = c->parts;
-
-    c->com[0] = c->com[1] = c->com[2] = c->mass = 0.;
-
-    /* Is the cell split? */
-    if ( c->split ) {
-    
-        /* Collect the centre of gravity and mass from the progeny. */
-        for ( k = 0 ; k < 8 ; k++ ) {
-            cp = c->progeny[k];
-            c->com[0] += cp->com[0]*cp->mass;
-            c->com[1] += cp->com[1]*cp->mass;
-            c->com[2] += cp->com[2]*cp->mass;
-            c->mass += cp->mass;
-            }
-
-	if ( c-> mass > 0. )
-	  {
-	    c->com[0] /= c->mass; 
-	    c->com[1] /= c->mass; 
-	    c->com[2] /= c->mass;
-	  }
-	else
-	  {
-	    c->com[0] = 0.;
-	    c->com[1] = 0.;
-	    c->com[2] = 0.;
-	  }
-            
-        }
-        
-    /* Otherwise, collect the multipole from local data. */
-    else {
-    
-        for ( k = 0 ; k < count ; k++ ) {
-            p = &parts[k];
-            c->com[0] += p->x[0]*p->mass;
-            c->com[1] += p->x[1]*p->mass;
-            c->com[2] += p->x[2]*p->mass;
-            c->mass += p->mass;
-            }
-
-	if ( c-> mass > 0. )
-	  {
-	    c->com[0] /= c->mass; 
-	    c->com[1] /= c->mass; 
-	    c->com[2] /= c->mass;
-	  }
-	else
-	  {
-	    c->com[0] = 0.;
-	    c->com[1] = 0.;
-	    c->com[2] = 0.;
-	  }            
-        }
-        
-    }
-    
-    
-/**
- * @brief Compute the interactions between all particles in a cell
- *        and the center of mass of another cell.
- *
- * @param ci The #cell containing the particles.
- * @param cj The #cell containing the center of mass.
- */
- 
-void iact_pair_pc ( struct cell *ci , struct cell *cj ) {
-
-    int j, k, count = ci->count;
-    double com[3], mcom, dx[3], r2, ir, w;
-    struct part *parts = ci->parts;
-    
-    /* Early abort? */
-    if ( count == 0 || cj->count == 0 )
-        return;
-        
-    /* message( "ci=[%.3e,%.3e,%.3e], cj=[%.3e,%.3e,%.3e], h=%.3e/%.3e.",
-        ci->loc[0], ci->loc[1], ci->loc[2], 
-        cj->loc[0], cj->loc[1], cj->loc[2],
-        ci->h[0], cj->h[0] ); */
-    
-    /* Sanity check. */
-    if ( cj->mass == 0.0 ){
-      printf("%e %e %e %d %p\n", cj->com[0], cj->com[1], cj->com[2], cj->count, cj);
-
-      for ( j = 0 ; j < cj->count ; ++j )
-	printf("part %d mass= %e\n", j, cj->parts[j].mass );
-
-        error( "com does not seem to have been set." );
-    }
-    /* Init the com's data. */
-    for ( k = 0 ; k < 3 ; k++ )
-        com[k] = cj->com[k];
-    mcom = cj->mass;
-
-    /* Loop over every particle in ci. */
-    for ( j = 0 ; j < count ; j++ ) {
-
-        /* Compute the pairwise distance. */
-        for ( r2 = 0.0 , k = 0 ; k < 3 ; k++ ) {
-            dx[k] = com[k] - parts[j].x[k];
-            r2 += dx[k]*dx[k];
-            }
-
-        /* Apply the gravitational acceleration. */
-        ir = 1.0 / sqrt( r2 );
-        w = mcom * const_G * ir * ir * ir;
-        for ( k = 0 ; k < 3 ; k++ )
-            parts[j].a[k] += w * dx[k];
-
-#if ICHECK >= 0
-	if ( parts[j].id == ICHECK )
-	  printf("[NEW] Can interact with the monopole. x= %f %f %f m= %f h= %f\n", com[0], com[1], com[2], mcom, cj->h[0]);
-#endif
-
-        } /* loop over every particle. */
-                
-    }
-    
-
-/**
- * @brief Compute the interactions between all particles in a cell.
- *
- * @param ci The #cell.
- * @param cj The other #cell.
- */
- 
-void iact_pair ( struct cell *ci , struct cell *cj ) {
-  
-  int i, j, k;
-    int count_i = ci->count, count_j = cj->count;
-    double dx[3], xi[3], ai[3], mi, mj, r2, w, ir;
-    struct part *parts_i = ci->parts, *parts_j = cj->parts;
-  
-    /* Early abort? */
-    if ( count_i == 0 || count_j == 0 )
-        return;
-
-      /* Distance between the CoMs */
-      for ( r2 = 0.0, k = 0 ; k < 3 ; k++ ) {
-        dx[k] = fabs( ci->com[k] - cj->com[k] );	
-        r2 += dx[k]*dx[k];
-      }
-
-      double s_max_i = ci->h[0]; 
-      double s_max_j = cj->h[0]; 
-      
-      if ( ( dist_min * dist_min * r2 > s_max_i * s_max_i ) && ( dist_min * dist_min * r2 > s_max_j * s_max_j ) )
-	{
-	  iact_pair_pc( ci, cj );
-	  iact_pair_pc( cj, ci );
-	}
-      else if ( ci->split == 0 && cj->split == 0 )
-	{
-	  /* Do direct summation */
-	
-        /* Loop over all particles... */
-        for ( i = 0 ; i < count_i ; i++ ) {
-
-            /* Init the ith particle's data. */
-            for ( k = 0 ; k < 3 ; k++ ) {
-                xi[k] = parts_i[i].x[k];
-                ai[k] = 0.0;
-                }
-            mi = parts_i[i].mass;
-
-            /* Loop over every following particle. */
-            for ( j = 0 ; j < count_j ; j++ ) {
-
-                /* Compute the pairwise distance. */
-                for ( r2 = 0.0 , k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = xi[k] - parts_j[j].x[k];
-                    r2 += dx[k]*dx[k];
-                    }
-
-                /* Apply the gravitational acceleration. */
-                ir = 1.0 / sqrt( r2 );
-                w = const_G * ir * ir * ir;
-                mj = parts_j[j].mass;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    parts_j[j].a[k] += w * dx[k] * mi;
-                    ai[k] -= w * dx[k] * mj;
-                    }
-
-#if ICHECK >= 0
-		if ( parts_i[i].id == ICHECK )
-		  printf("[NEW] Interaction with particle id= %d (pair i)\n", parts_j[j].id);
-
-		if ( parts_j[j].id == ICHECK )
-		  printf("[NEW] Interaction with particle id= %d (pair j) h_i= %f h_j= %f ci= %p cj= %p count_i= %d count_j= %d d_i= %d d_j= %d\n", parts_i[i].id, ci->h[0], cj->h[0], ci, cj, count_i, count_j,
-			 ci->res, cj->res ) ;
-#endif
-                } /* loop over every other particle. */
-
-            /* Store the accumulated acceleration on the ith part. */
-            for ( k = 0 ; k < 3 ; k++ )
-                parts_i[i].a[k] += ai[k];
-
-            } /* loop over all particles. */
-	  
-
-	}
-      else {
-
-      if ( ci == cj )
-	printf("The impossible has happened\n");  //debug
-
-
-      /* We can split one of the two cells. Let's try the biggest one */
-      if ( ci->h[0] > cj->h[0] ) {
-
-	if (  ci->split ) 
-	  for ( k = 0 ; k < 8 ; k++ )
-	    iact_pair ( ci->progeny[k], cj );
-	
-	else if ( cj->split ) 
-	  for ( k = 0 ; k < 8 ; k++ )
-	    iact_pair ( ci, cj->progeny[k] );
-
-	else
-	  printf("oO\n");
-
-      }
-      else {
-
-	if ( cj->split ) 
-	  for ( k = 0 ; k < 8 ; k++ )
-	    iact_pair ( ci, cj->progeny[k] );
-	
-	else if (  ci->split ) 
-	  for ( k = 0 ; k < 8 ; k++ )
-	    iact_pair ( ci->progeny[k], cj );
-
-	else
-	  printf("Oo\n");
-	
-	    }
-    }
- }
-    
-
-/**
- * @brief Compute the interactions between all particles in a cell.
- *
- * @param c The #cell.
- */
- 
-void iact_self ( struct cell *c ) {
-
-    int i, j, k, count = c->count;
-    double xi[3], ai[3], mi, mj, dx[3], r2, ir, w;
-    struct part *parts = c->parts;
-    
-    /* Early abort? */
-    if ( count == 0 )
-        return;
-
-    //    printf("entering cell at x= %f %f %f h= %f\n",  c->loc[0], c->loc[1], c->loc[2], c->h[0] );
-    
-    /* message( "cell=[%.3e,%.3e,%.3e], h=%.3e.",
-        c->loc[0], c->loc[1], c->loc[2], c->h[0] ); */
-    
-    /* Recurse? */
-    if ( c->split )
-      {
-        for ( j = 0 ; j < 8 ; j++ ) {
-	  iact_self( c->progeny[j] );
-            for ( k = j+1 ; k < 8 ; k++ )
-	      iact_pair( c->progeny[j] , c->progeny[k] );
-            }
-      }
-    /* Otherwise, compute interactions directly. */
-    else {
-
-        /* Loop over all particles... */
-        for ( i = 0 ; i < count ; i++ ) {
-        
-            /* Init the ith particle's data. */
-            for ( k = 0 ; k < 3 ; k++ ) {
-                xi[k] = parts[i].x[k];
-                ai[k] = 0.0;
-                }
-            mi = parts[i].mass;
-                
-            /* Loop over every following particle. */
-            for ( j = i+1 ; j < count ; j++ ) {
-            
-                /* Compute the pairwise distance. */
-                for ( r2 = 0.0 , k = 0 ; k < 3 ; k++ ) {
-                    dx[k] = xi[k] - parts[j].x[k];
-                    r2 += dx[k]*dx[k];
-                    }
-                    
-                /* Apply the gravitational acceleration. */
-                ir = 1.0 / sqrt( r2 );
-                w = const_G * ir * ir * ir;
-                mj = parts[j].mass;
-                for ( k = 0 ; k < 3 ; k++ ) {
-                    parts[j].a[k] += w * dx[k] * mi;
-                    ai[k] -= w * dx[k] * mj;
-                    }
-
-#if ICHECK >= 0
-		if ( parts[i].id == ICHECK )
-		  printf("[NEW] Interaction with particle id= %d (self i)\n", parts[j].id);
-
-		if ( parts[j].id == ICHECK )
-		  printf("[NEW] Interaction with particle id= %d (self j)\n", parts[i].id);
-#endif
-            
-                } /* loop over every other particle. */
-                
-            /* Store the accumulated acceleration on the ith part. */
-            for ( k = 0 ; k < 3 ; k++ )
-                parts[i].a[k] += ai[k];
-        
-            } /* loop over all particles. */
-    
-        } /* otherwise, compute interactions directly. */
-
-    }
-    
-
-/**
- * @brief Create the tasks for the cell pair/self.
- *
- * @param s The #sched in which to create the tasks.
- * @param ci The first #cell.
- * @param cj The second #cell.
- */
- 
-void create_tasks ( struct qsched *s , struct cell *ci , struct cell *cj ) {
-
-    int j, k;
-    qsched_task_t tid;
-    struct cell *data[2];
-    double dx, r2;
-    
-    /* If either cell is empty, stop. */
-    if ( ci->count == 0 || ( cj != NULL && cj->count == 0 ) )
-        return;
-
-    /* Single cell? */
-    if ( cj == NULL ) {
-    
-        /* Is this cell split? */
-        if ( ci->split && ci->count > task_limit ) {
-
-            /* Recurse over the progeny. */
-            for ( j = 0 ; j < 8 ; j++ ) {
-            
-                /* Make self-task. */
-                create_tasks( s , ci->progeny[j] , NULL );
-                
-                /* Make all pair tasks. */
-                for ( k = j+1 ; k < 8 ; k++ )
-                    create_tasks( s , ci->progeny[j] , ci->progeny[k] );
-                }
-        
-            }
-            
-        /* Otherwise, add a self-interaction task. */
-        else {
-            
-            /* Set the data. */
-            data[0] = ci; data[1] = NULL;
-            
-            /* Create the task. */
-            tid = qsched_addtask( s , task_type_self , task_flag_none , data , sizeof(struct cell *) * 2 , ci->count*ci->count/2 );
-            
-            /* Add the resource. */
-            qsched_addlock( s , tid , ci->res );
-            
-            /* If this call might recurse, add a dependency on the cell's COM. */
-            if ( ci->split )
-                qsched_addunlock( s , ci->com_tid , tid );
-        
-            }
-    
-        }
-        
-    /* Otherwise, it's a pair. */
-    else {
-
-
-      //      printf("Pair !\n");
-
-      /* Distance between the CoMs */
-      for ( r2 = 0.0, k = 0 ; k < 3 ; k++ ) {
-        dx = fabs( ci->loc[k] - cj->loc[k] );	
-        r2 += dx*dx;
-      }
-      
-      const double s_max_i = ci->h[0]; 
-      const double s_max_j = cj->h[0]; 
-      
-      /* Check whether we can use the multipoles. */
-      if ( ( dist_min * dist_min * r2 > s_max_i * s_max_i ) && ( dist_min * dist_min * r2 > s_max_j * s_max_j ) )
-	{  
-	  data[0] = ci; data[1] = cj;
-	  tid = qsched_addtask( s , task_type_pair_pc , task_flag_none , data , sizeof(struct cell *) * 2 , ci->count );
-	  qsched_addlock( s , tid , ci->res );
-	  qsched_addunlock( s , cj->com_tid , tid );
-	  
-	  data[0] = cj; data[1] = ci;
-	  tid = qsched_addtask( s , task_type_pair_pc , task_flag_none , data , sizeof(struct cell *) * 2 , cj->count );
-	  qsched_addlock( s , tid , cj->res );
-	  qsched_addunlock( s , ci->com_tid , tid );
-	}
-      
-            
-      /* Otherwise, generate a part-part task. */
-      else if ( ci->split == 0 && cj->split == 0 )
-	{
-        
-	  /* Set the data. */
-	  data[0] = ci; data[1] = cj;
-
-	  /* Create the task. */
-	  tid = qsched_addtask( s , task_type_pair , task_flag_none , data , sizeof(struct cell *) * 2 , ci->count*cj->count );
-
-	  /* Add the resources. */
-	  qsched_addlock( s , tid , ci->res );
-	  qsched_addlock( s , tid , cj->res );
-
-	  /* qsched_addunlock( s , ci->com_tid , tid ); */
-	  /* qsched_addunlock( s , cj->com_tid , tid ); */
-
-
-	}
-
-
-      else if ( ci->count > task_limit && cj->count > task_limit )
-	{
-	  
-	  /* if ( ci == cj ) */
-	  /*   printf("The impossible has happened\n");  //debug */
-	  
-	  /* We can split one of the two cells. Let's try the biggest one */
-	  if ( ci->h[0] > cj->h[0] ) {
-	    
-	    if (  ci->split ) 
-	      for ( k = 0 ; k < 8 ; k++ )
-		create_tasks ( s , ci->progeny[k], cj );
-	    
-	    else if ( cj->split ) 
-	      for ( k = 0 ; k < 8 ; k++ )
-		create_tasks ( s , ci, cj->progeny[k] );
-	    
-	    /* else */
-	    /*   printf("oO\n"); */
-	    
-	  }
-	  else {
-	    
-	    if ( cj->split ) 
-	      for ( k = 0 ; k < 8 ; k++ )
-		create_tasks ( s , ci, cj->progeny[k] );
-	    
-	    else if (  ci->split ) 
-	      for ( k = 0 ; k < 8 ; k++ )
-		create_tasks ( s , ci->progeny[k], cj );
-	    
-	    /* else */
-	    /*   printf("Oo\n"); */
-	    
-	  }
-	}
-
-      /* /\* Create a task if too few particles *\/ */
-      else 
-      	{
-      	  /* Set the data. */
-      	  data[0] = ci; data[1] = cj;
-
-      	  /* Create the task. */
-      	  tid = qsched_addtask( s , task_type_pair , task_flag_none , data , sizeof(struct cell *) * 2 , ci->count*cj->count );
-
-      	  /* Add the resources. */
-      	  qsched_addlock( s , tid , ci->res );
-      	  qsched_addlock( s , tid , cj->res );
-
-      	  /* Depend on the COMs in case this task recurses. */
-      	  if ( ci->split || cj->split ) {
-      	    qsched_addunlock( s , ci->com_tid , tid );
-      	    qsched_addunlock( s , cj->com_tid , tid );
-      	  }
-      	}
-
-	
-
-      } /* otherwise, it's a pair. */
-      
-    }
-
-
-
-
-
-
-
-
-
-
-/**
- * @brief Compute the center of mass of a given cell recursively.
- *
- * @param c The #cell.
- */
- 
-void legacy_comp_com ( struct cell *c , int* countCoMs ) {
-
-    int k, count = c->count;
-    struct cell *cp;
-    struct part *p, *parts = c->parts;
-
-    ++(*countCoMs);
-
-    /* Is the cell split? */
-    if ( c->split ) {
-    
-      
-      for ( k = 0 ; k < 8 ; k++ ) {
-	legacy_comp_com( c->progeny[k] , countCoMs );
-	}
-
-        /* Collect the centre of gravity and mass from the progeny. */
-        for ( k = 0 ; k < 8 ; k++ ) {
-            cp = c->progeny[k];
-            c->com_legacy[0] += cp->com_legacy[0]*cp->mass_legacy;
-            c->com_legacy[1] += cp->com_legacy[1]*cp->mass_legacy;
-            c->com_legacy[2] += cp->com_legacy[2]*cp->mass_legacy;
-            c->mass_legacy += cp->mass_legacy;
-            }
-
-	if ( c-> mass_legacy != 0. )
-	  {
-	    c->com_legacy[0] /= c->mass_legacy;
-	    c->com_legacy[1] /= c->mass_legacy;
-	    c->com_legacy[2] /= c->mass_legacy;
-	  }
-	else
-	  {
-	    c->com_legacy[0] = 0.;
-	    c->com_legacy[1] = 0.;
-	    c->com_legacy[2] = 0.;
-	  }
-            
-        }
-        
-    /* Otherwise, collect the multipole from local data. */
-    else {
-    
-        for ( k = 0 ; k < count ; k++ ) {
-            p = &parts[k];
-            c->com_legacy[0] += p->x[0]*p->mass;
-            c->com_legacy[1] += p->x[1]*p->mass;
-            c->com_legacy[2] += p->x[2]*p->mass;
-            c->mass_legacy += p->mass;
-            }
-
-	if ( c-> mass_legacy != 0. )
-	  {
-	    c->com_legacy[0] /= c->mass_legacy;
-	    c->com_legacy[1] /= c->mass_legacy;
-	    c->com_legacy[2] /= c->mass_legacy;
-	  }
-	else
-	  {
-	    c->com_legacy[0] = 0.;
-	    c->com_legacy[1] = 0.;
-	    c->com_legacy[2] = 0.;
-	  }
-        }
-        
-    }
-
-
-
-
-/**
- * @brief Interacts a particle with a cell recursively using the original B-H tree walk procedure
- *
- * @param parts The array of particles
- * @param i The particle of interest
- * @param cell The cell the particle interacts with
- */
-void legacy_interact( struct part* parts, int i , struct cell* cell , int monitor,  int* countMultipoles, int* countPairs) {
-
-  int j, k;
-  double r2, ir, w, dx[3];
-
-  /* Early abort */
-  if( cell->count == 0 )
-    return;
-
-  if( parts[i].id == monitor )
-    printf( "Particle %d entering cell at x= %f %f %f h= %f\n", parts[i].id , cell->loc[0], cell->loc[1], cell->loc[2], cell->h[0] );
-
-  /* Are we in a leaf ? */
-  if ( ! cell->split )
-    {
-      if( parts[i].id == monitor )
-	printf( "This is a leaf with %d particles.\n", cell->count);
-
-      /* Interact the particle with the particles in the leaf */
-      
-      for( j = 0 ; j < cell->count ; ++ j )
-	{
-	  if( cell->parts[j].id == parts[i].id )
-	    continue;
-
-	  if( parts[i].id == monitor )
-	    printf( "[BH_] Interaction with particle id= %d\n", cell->parts[j].id );
-
-	  /* Compute the pairwise distance. */
-	  for ( r2 = 0.0 , k = 0 ; k < 3 ; k++ ) {
-	    dx[k] = cell->parts[j].x[k] - parts[i].x[k];
-	    r2 += dx[k]*dx[k];
-	  }
-	  
-	  /* Apply the gravitational acceleration. */
-	  ir = 1.0 / sqrt( r2 );
-	  w = cell->parts[j].mass * const_G * ir * ir * ir;
-	  for ( k = 0 ; k < 3 ; k++ )
-	    parts[i].a_legacy[k] += w * dx[k];
-
-	  (*countPairs)++;
-	}
-    }
-  else
-    {
-      /* We are in a node */
-      for ( r2 = 0.0, k = 0 ; k < 3 ; k++ ) {
-        dx[k] = cell->loc[k] - parts[i].x[k];
-        r2 += dx[k]*dx[k];
-      }
-      
-      if( parts[i].id == monitor )
-	printf( "This is a node with %d particles h= %f. r= %f theta= %f\n", cell->count, cell->h[0], sqrt( r2 ), dist_min );
-
-
-      /* Is the cell far enough ? */
-      if( dist_min*dist_min*r2 > cell->h[0]*cell->h[0]  )
-	{
-	  /* Interact with the monopole */
-	  if( parts[i].id == monitor )
-	    printf( "[BH_] Can interact with the monopole. x= %f %f %f m= %f h= %f\n", cell->com_legacy[0] , cell->com_legacy[1] , cell->com_legacy[2] , cell->mass_legacy , cell->h[0]);
-
-	  for ( r2 = 0.0, k = 0 ; k < 3 ; k++ ) {
-	    dx[k] = cell->com_legacy[k] - parts[i].x[k];
-	    r2 += dx[k]*dx[k];
-	  }
-
-	  /* Apply the gravitational acceleration. */
-	  ir = 1.0 / sqrt( r2 );
-	  w = cell->mass_legacy * const_G * ir * ir * ir;
-	  for ( k = 0 ; k < 3 ; k++ )
-            parts[i].a_legacy[k] += w * dx[k];
-
-	  (*countMultipoles)++;
-	  
-	}
-      else
-	{
-	  if( parts[i].id == monitor )
-	    printf( "Recursing...\n" );
-
-	  /* OK, we need to recurse */
-	  for( k = 0 ; k < 8 ; k++ )
-	    legacy_interact( parts, i, cell->progeny[k] , monitor , countMultipoles , countPairs );
-	}
-    }
-}
-  
-  
-  
-/**
- * @brief Does a tree walk as in the B-H original work for all particles
- *
- * @param N The number of particles
- * @param parts The array of particles
- * @param root The root cell of the tree
- * @param monitor ID of the particle to monitor and output interactions to stdout
- */
-void legacy_tree_walk( int N , struct part* parts , struct cell* root , int monitor , int* countMultipoles, int* countPairs , int* countCoMs ) {
-  
-  int i;
-
-  legacy_comp_com( root , countCoMs );
-  
-  for( i = 0 ; i < N ; ++i )
-    {
-      if ( parts[i].id == monitor )
-	printf( "tree walk for particle %d x= %f %f %f \n", parts[i].id , parts[i].x[0] , parts[i].x[1], parts[i].x[2] );
-
-      legacy_interact( parts , i , root , monitor , countMultipoles , countPairs );
-
-      if ( parts[i].id == monitor )
-	printf( "\n[LEGACY] acceleration for particle %d a= %f %f %f \n", parts[i].id , parts[i].a_legacy[0] , parts[i].a_legacy[1], parts[i].a_legacy[2] );
-
-    }
-
-  
-}
-  
-
-  
-
-
-
-
-
-
-/**
- * @brief Solve the particle interactions using the stupid N^2 algorithm
- *
- * @param N The number of particles
- * @param parts The array of particles
- */
-void interact_exact( int N , struct part* parts , int monitor) {
-
-  int i, j, k;
-  double ir, w, r2, dx[3] = {0., 0., 0.};
-
-  for ( i = 0 ; i < N ; ++i ) {
-      
-    for ( j = i+1 ; j < N ; ++j ) {
-	
-      /* Compute the pairwise distance. */
-      for ( r2 = 0.0 , k = 0 ; k < 3 ; k++ ) {
-	dx[k] = parts[i].x[k] - parts[j].x[k];
-	r2 += dx[k]*dx[k];
-      }
-      
-      
-      /* Apply the gravitational acceleration. */
-      ir = 1.0 / sqrt( r2 );
-      w = const_G * ir * ir * ir;
-
-      for ( k = 0 ; k < 3 ; k++ ) {
-	parts[j].a_exact[k] += w * dx[k] * parts[i].mass;
-	parts[i].a_exact[k] -= w * dx[k] * parts[j].mass;
-      }
-      
-      
-    }
-  }
-
-  for( i = 0 ; i < N ; ++i )
-    if( parts[i].id == monitor )
-      	printf( "[EXACT ] acceleration for particle %d a= %f %f %f \n\n", parts[i].id , parts[i].a_exact[0] , parts[i].a_exact[1], parts[i].a_exact[2] );
-
-}
-
-
-
- 
-/**
- * @brief Set up and run a task-based Barnes-Hutt N-body solver.
- *
- * @param N The number of random particles to use.
- * @param nr_threads Number of threads to use.
- */
- 
-void test_bh ( int N , int nr_threads , int runs ) {
-
-  int i , k;
-    struct cell *root;
-    struct part *parts;
-    struct qsched s;
-    ticks tic, toc_run, tot_setup = 0, tot_run = 0, tic_legacy;
-    int countMultipoles, countPairs, countCoMs;
-
-    
-    /* Runner function. */
-    void runner ( int type , void *data ) {
-    
-        ticks tic = getticks();
-    
-        /* Decode the data. */
-        struct cell **d = (struct cell **)data;
-    
-        /* Decode and execute the task. */
-        switch ( type ) {
-            case task_type_self:
-                iact_self( d[0] );
-                break;
-            case task_type_pair:
-                iact_pair( d[0] , d[1] );
-                break;
-            case task_type_pair_pc:
-                iact_pair_pc( d[0] , d[1] );
-                break;
-            case task_type_com:
-                comp_com( d[0] );
-                break;
-            default:
-                error( "Unknown task type." );
-            }
-            
-        atomic_add( &task_timers[type] , getticks() - tic );
-        
-        }
-        
-    /* Initialize the per-task type timers. */
-    for ( k = 0 ; k < task_type_count ; k++ )
-        task_timers[k] = 0;
-    
-    /* Initialize the scheduler. */
-    qsched_init( &s , nr_threads , qsched_flag_noreown );
-    
-    /* Init and fill the particle array. */
-    if ( ( parts = (struct part *)malloc( sizeof(struct part) * N ) ) == NULL )
-        error( "Failed to allocate particle buffer." );
-
-    // int hN = ceil( cbrt( N ) );
-    // double h = 1.0 / hN;
-    for ( k = 0 ; k < N ; k++ ) {
-        parts[k].id = k;
-        parts[k].x[0] = ((double)rand())/RAND_MAX; // h*(k % hN);
-        parts[k].x[1] = ((double)rand())/RAND_MAX; // h*((k / hN) % hN);
-        parts[k].x[2] = ((double)rand())/RAND_MAX; // h*(k / hN / hN);
-        parts[k].mass = ((double)rand())/RAND_MAX; // 1.0;
-        parts[k].a[0] = 0.0;
-        parts[k].a[1] = 0.0;
-        parts[k].a[2] = 0.0;
-        }
-        
-    /* Init the cells. */
-    root = cell_get();
-    root->loc[0] = 0.0; root->loc[1] = 0.0; root->loc[2] = 0.0;
-    root->h[0] = 1.0; root->h[1] = 1.0; root->h[2] = 1.0;
-    root->count = N;
-    root->parts = parts;
-    cell_split( root , &s );
-
-    printf("----------------------------------------------------------\n");
-
-    /* Do a traditional (serial) B-H tree walk */
-    tic_legacy = getticks();
-    countMultipoles = 0;
-    countPairs = 0;
-    countCoMs = 0;
-    legacy_tree_walk( N , parts , root , ICHECK , &countMultipoles , &countPairs , &countCoMs );
-    printf( "Legacy B-H walk (1 thread) took %lli (= %e) ticks\n", getticks() - tic_legacy , (float)(getticks() - tic_legacy) );
-    
-
-
-    /* Do a N^2 interactions calculation */
-    interact_exact( N , parts , ICHECK );
-
-    printf("----------------------------------------------------------\n");
-
-    /* Create the tasks. */
-    tic = getticks();
-    create_tasks( &s , root , NULL );
-    tot_setup += getticks() - tic;
-
-    /* Dump the number of tasks. */
-    message( "total nr of tasks: %i." , s.count );
-    message( "total nr of deps: %i." , s.count_deps );
-    message( "total nr of res: %i." , s.count_res );
-    message( "total nr of locks: %i." , s.count_locks );
-    message( "total nr of uses: %i." , s.count_uses );
-    int counts[ task_type_count ];
-    for ( k = 0 ; k < task_type_count ; k++ )
-        counts[k] = 0;
-    for ( k = 0 ; k < s.count ; k++ )
-        counts[ s.tasks[k].type ] += 1;
-    printf( "task counts: [ %8s %8s %8s %8s ]\n" , "self", "direct" , "m-poles" , "CoMs" );
-    printf( "task counts: [ " );
-    for ( k = 0 ; k < task_type_count ; k++ )
-        printf( "%8i " , counts[k] );
-    printf( "].\n" );
-
-    printf( "task counts: [ %8i %8i %8i %8i ].\n" , 0 , countPairs , countMultipoles , countCoMs );
-        
-    /* Loop over the number of runs. */
-    for ( k = 0 ; k < runs ; k++ ) {
-
-      for ( i = 0 ; i < N ; ++i ) {
-      	parts[i].a[0] = 0.;
-      	parts[i].a[1] = 0.;
-      	parts[i].a[2] = 0.;
-      }
-
-        /* Execute the tasks. */
-        tic = getticks();
-        qsched_run( &s , nr_threads , runner );
-	toc_run = getticks();
-	message( "%ith run took %lli (= %e) ticks..." , k , toc_run - tic , (float)(toc_run - tic) );
-        tot_run += toc_run - tic;
-        
-        }
-        
-    /* Dump the tasks. */
-    /* for ( k = 0 ; k < s.count ; k++ )
-        printf( " %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid , s.tasks[k].tic , s.tasks[k].toc ); */
-        
-    /* Dump the costs. */
-    message( "costs: setup=%lli ticks, run=%lli ticks." ,
-        tot_setup , tot_run/runs );
-        
-    /* Dump the timers. */
-    for ( k = 0 ; k < qsched_timer_count ; k++ )
-        message( "timer %s is %lli ticks." , qsched_timer_names[k] , s.timers[k]/runs );
-    
-    /* Dump the per-task type timers. */
-    printf( "task timers: [ " );
-    for ( k = 0 ; k < task_type_count ; k++ )
-        printf( "%lli " , task_timers[k]/runs );
-    printf( "] ticks.\n" );
-
-
-    /* Dump the particles to a file */
-    FILE* file = fopen( "particle_dump.dat" , "w" );
-    fprintf(file, "# a_exact.x   a_exact.y    a_exact.z    a_legacy.x    a_legacy.y    a_legacy.z    a_new.x     a_new.y    a_new.z\n");
-    for ( k = 0 ; k < N ; ++k )
-      fprintf ( file , "%d %e %e %e %e %e %e %e %e %e\n" ,
-    		parts[k].id,
-    		parts[k].a_exact[0] , parts[k].a_exact[1] , parts[k].a_exact[2] ,
-    		parts[k].a_legacy[0] , parts[k].a_legacy[1] , parts[k].a_legacy[2] ,
-    		parts[k].a[0] , parts[k].a[1] , parts[k].a[2] );
-    fclose( file );
-    
-    /* Clean up. */
-    qsched_free( &s );
-    
-    }
-    
-
-/**
- * @brief Main function.
- */
- 
-int main ( int argc , char *argv[] ) {
-
-    int c, nr_threads;
-    int N = 1000, runs = 1;
-    
-    /* Get the number of threads. */
-    #pragma omp parallel shared(nr_threads)
-    {
-        if ( omp_get_thread_num() == 0 )
-            nr_threads = omp_get_num_threads();
-    }
-    
-    /* Parse the options */
-    while ( ( c = getopt( argc , argv  , "n:r:t:" ) ) != -1 )
-        switch( c ) {
-	        case 'n':
-	            if ( sscanf( optarg , "%d" , &N ) != 1 )
-	                error( "Error parsing number of particles." );
-	            break;
-            case 'r':
-                if ( sscanf( optarg , "%d" , &runs ) != 1 )
-                    error( "Error parsing number of runs." );
-                break;
-	        case 't':
-	            if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
-	                error( "Error parsing number of threads." );
-	            omp_set_num_threads( nr_threads );
-	            break;
-	        case '?':
-                fprintf( stderr , "Usage: %s [-t nr_threads] [-n N] [-r runs]\n" , argv[0] );
-                fprintf( stderr , "Solves the N-body problem using a Barnes-Hutt\n"
-                                  "tree code with N random particles in [0,1]^3 using\n"
-                                  "nr_threads threads.\n" );
-	            exit( EXIT_FAILURE );
-	        }
-            
-    /* Dump arguments. */
-    message( "Computing the N-body problem over %i particles using %i threads (%i runs)." ,
-        N , nr_threads , runs );
-        
-    /* Run the test. */
-    test_bh( N , nr_threads, runs );
-    
-    return 0;
-    
-    }
-    
-    
diff --git a/examples/test_bh_3.c b/examples/test_bh_3.c
deleted file mode 100644
index 9f17a852c12c112d9600757a8390610424cf817d..0000000000000000000000000000000000000000
--- a/examples/test_bh_3.c
+++ /dev/null
@@ -1,1285 +0,0 @@
-/*******************************************************************************
- * This file is part of QuickSched.
- * Coypright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk),
- *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- *
-* *****************************************************************************/
-
-/* Config parameters. */
-#include "../config.h"
-
-/* Standard includes. */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <math.h>
-#include <omp.h>
-
-/* Local includes. */
-#include "quicksched.h"
-
-/* Some local constants. */
-#define cell_pool_grow 1000
-#define cell_maxparts 16
-#define task_limit 5000
-#define const_G 6.6738e-8
-#define dist_min 0.5  // 0.5
-
-#define ICHECK -1
-
-/** Data structure for the particles. */
-struct part {
-  double x[3];
-  double a[3];
-  double a_legacy[3];
-  double a_exact[3];
-  double mass;
-  int id;
-};
-
-/** Data structure for the BH tree cell. */
-struct cell {
-  double loc[3];
-  double h;
-
-  int split, count;
-  struct part *parts;
-
-  struct cell *firstchild; /* Next node if opening */
-  struct cell *sibling;    /* Next node */
-
-  /* We keep both CoMs and masses to make sure the comp_com calculation is
-   * correct (use an anonymous union to keep variable names compact).  */
-  union {
-
-    /* Information for the legacy walk */
-    struct {
-      double com[3];
-      double mass;
-    } legacy;
-
-    /* Information for the QuickShed walk */
-    struct {
-      double com[3];
-      double mass;
-    } new;
-  };
-
-  int res, com_tid;
-
-} __attribute__((aligned(128)));
-
-/** Task types. */
-enum task_type {
-  task_type_self = 0,
-  task_type_pair,
-  task_type_pair_pc,
-  task_type_com,
-  task_type_count
-};
-
-/** Per-type timers. */
-ticks task_timers[task_type_count];
-
-/** Global variable for the pool of allocated cells. */
-struct cell *cell_pool = NULL;
-
-/**
- * @brief Get a #cell from the pool.
- */
-struct cell *cell_get() {
-
-  struct cell *res;
-  int k;
-
-  /* Allocate a new batch? */
-  if (cell_pool == NULL) {
-
-    /* Allocate the cell array. */
-    if ((cell_pool =
-             (struct cell *)calloc(cell_pool_grow, sizeof(struct cell))) ==
-        NULL)
-      error("Failed to allocate fresh batch of cells.");
-
-    /* Link them up via their progeny pointers. */
-    for (k = 1; k < cell_pool_grow; k++)
-      cell_pool[k - 1].firstchild = &cell_pool[k];
-  }
-
-  /* Pick a cell off the pool. */
-  res = cell_pool;
-  cell_pool = cell_pool->firstchild;
-
-  /* Clean up a few things. */
-  res->res = qsched_res_none;
-  res->firstchild = 0;
-
-  /* Return the cell. */
-  return res;
-}
-
-/**
- * @brief Sort the parts into eight bins along the given pivots and
- *        fill the multipoles. Also adds the hierarchical resources
- *        to the sched.
- *
- * @param c The #cell to be split.
- * @param N The total number of parts.
- * @param s The #sched to store the resources.
- */
-void cell_split(struct cell *c, struct qsched *s) {
-
-  int i, j, k, kk, count = c->count;
-  struct part temp, *parts = c->parts;
-  struct cell *cp;
-  int left[8], right[8];
-  double pivot[3];
-  static struct cell *root = NULL;
-  struct cell *progenitors[8];
-
-  /* Set the root cell. */
-  if (root == NULL) {
-    root = c;
-    // c->depth = 0;
-    // c->parent = 0;
-    c->sibling = 0;
-  }
-
-  /* Add a resource for this cell if it doesn't have one yet. */
-  if (c->res == qsched_res_none)
-    c->res = qsched_addres(s, qsched_owner_none, qsched_res_none);
-
-  /* Attach a center-of-mass task to the cell. */
-  if (count > 0)
-    c->com_tid = qsched_addtask(s, task_type_com, task_flag_none, &c,
-                                sizeof(struct cell *), 1);
-
-  /* Does this cell need to be split? */
-  if (count > cell_maxparts) {
-
-    /* Mark this cell as split. */
-    c->split = 1;
-
-    /* Create the progeny. */
-    for (k = 0; k < 8; k++) {
-      progenitors[k] = cp = cell_get();
-      // cp->parent = c;
-      // cp->depth = c->depth + 1;
-      cp->loc[0] = c->loc[0];
-      cp->loc[1] = c->loc[1];
-      cp->loc[2] = c->loc[2];
-      cp->h = c->h / 2;
-      cp->h = c->h / 2;
-      cp->h = c->h / 2;
-      cp->res = qsched_addres(s, qsched_owner_none, c->res);
-      if (k & 4) cp->loc[0] += cp->h;
-      if (k & 2) cp->loc[1] += cp->h;
-      if (k & 1) cp->loc[2] += cp->h;
-    }
-
-    /* Init the pivots. */
-    for (k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->h / 2;
-
-    /* Split along the x-axis. */
-    i = 0;
-    j = count - 1;
-    while (i <= j) {
-      while (i <= count - 1 && parts[i].x[0] < pivot[0]) i += 1;
-      while (j >= 0 && parts[j].x[0] >= pivot[0]) j -= 1;
-      if (i < j) {
-        temp = parts[i];
-        parts[i] = parts[j];
-        parts[j] = temp;
-      }
-    }
-    left[1] = i;
-    right[1] = count - 1;
-    left[0] = 0;
-    right[0] = j;
-
-    /* Split along the y axis, twice. */
-    for (k = 1; k >= 0; k--) {
-      i = left[k];
-      j = right[k];
-      while (i <= j) {
-        while (i <= right[k] && parts[i].x[1] < pivot[1]) i += 1;
-        while (j >= left[k] && parts[j].x[1] >= pivot[1]) j -= 1;
-        if (i < j) {
-          temp = parts[i];
-          parts[i] = parts[j];
-          parts[j] = temp;
-        }
-      }
-      left[2 * k + 1] = i;
-      right[2 * k + 1] = right[k];
-      left[2 * k] = left[k];
-      right[2 * k] = j;
-    }
-
-    /* Split along the z axis, four times. */
-    for (k = 3; k >= 0; k--) {
-      i = left[k];
-      j = right[k];
-      while (i <= j) {
-        while (i <= right[k] && parts[i].x[2] < pivot[2]) i += 1;
-        while (j >= left[k] && parts[j].x[2] >= pivot[2]) j -= 1;
-        if (i < j) {
-          temp = parts[i];
-          parts[i] = parts[j];
-          parts[j] = temp;
-        }
-      }
-      left[2 * k + 1] = i;
-      right[2 * k + 1] = right[k];
-      left[2 * k] = left[k];
-      right[2 * k] = j;
-    }
-
-    /* Store the counts and offsets. */
-    for (k = 0; k < 8; k++) {
-      progenitors[k]->count = right[k] - left[k] + 1;
-      progenitors[k]->parts = &c->parts[left[k]];
-    }
-
-    /* Prepare the pointers. */
-    for (k = 0; k < 7; k++) {
-
-      /* Find the next non-empty sibling */
-      for (kk = k + 1; kk < 8; ++kk) {
-        if (progenitors[kk]->count > 0) {
-          progenitors[k]->sibling = progenitors[kk];
-          break;
-        }
-      }
-
-      /* No non-empty sibling ? Go back a level */
-      if (kk == 8) progenitors[k]->sibling = c->sibling;
-    }
-
-    /* Last progenitor links to the next sibling */
-    progenitors[7]->sibling = c->sibling;
-    c->firstchild = progenitors[0];
-
-    /* Recurse. */
-    for (k = 0; k < 8; k++) cell_split(progenitors[k], s);
-
-    /* Link the COM tasks. */
-    for (k = 0; k < 8; k++)
-      if (progenitors[k]->count > 0)
-        qsched_addunlock(s, progenitors[k]->com_tid, c->com_tid);
-
-  } /* does the cell need to be split? */
-
-  /* Set this cell's resources ownership. */
-  qsched_res_own(s, c->res,
-                 s->nr_queues * (c->parts - root->parts) / root->count);
-}
-
-/* -------------------------------------------------------------------------- */
-/* New tree walk */
-/* -------------------------------------------------------------------------- */
-
-/**
- * @brief Compute the center of mass of a given cell.
- *
- * @param c The #cell.
- */
-void comp_com(struct cell *c) {
-
-  int k, count = c->count;
-  struct cell *cp;
-  struct part *p, *parts = c->parts;
-  double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;
-
-  if (c->split) {
-
-    /* Loop over the projenitors and collect the multipole information. */
-    for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
-      double cp_mass = cp->new.mass;
-      com[0] += cp->new.com[0] * cp_mass;
-      com[1] += cp->new.com[1] * cp_mass;
-      com[2] += cp->new.com[2] * cp_mass;
-      mass += cp_mass;
-    }
-
-    /* Otherwise, collect the multipole from the particles. */
-  } else {
-
-    for (k = 0; k < count; k++) {
-      p = &parts[k];
-      double p_mass = p->mass;
-      com[0] += p->x[0] * p_mass;
-      com[1] += p->x[1] * p_mass;
-      com[2] += p->x[2] * p_mass;
-      mass += p_mass;
-    }
-  }
-
-  /* Store the COM data, if any was collected. */
-  if (mass > 0.0) {
-    double imass = 1.0 / mass;
-    c->new.com[0] = com[0] * imass;
-    c->new.com[1] = com[1] * imass;
-    c->new.com[2] = com[2] * imass;
-    c->new.mass = mass;
-  } else {
-    c->new.com[0] = 0.0;
-    c->new.com[1] = 0.0;
-    c->new.com[2] = 0.0;
-    c->new.mass = 0.0;
-  }
-}
-
-/**
- * @brief Compute the interactions between all particles in a cell
- *        and the center of mass of another cell.
- *
- * @param ci The #cell containing the particles.
- * @param cj The #cell containing the center of mass.
- */
-void iact_pair_pc(struct cell *ci, struct cell *cj) {
-  int j, k, count = ci->count;
-  double com[3], mcom, dx[3], r2, ir, w;
-  struct part *parts = ci->parts;
-
-  /* Early abort? */
-  if (count == 0 || cj->count == 0) return;
-
-  /* message( "ci=[%.3e,%.3e,%.3e], cj=[%.3e,%.3e,%.3e], h=%.3e/%.3e.",
-      ci->loc[0], ci->loc[1], ci->loc[2],
-      cj->loc[0], cj->loc[1], cj->loc[2],
-      ci->h, cj->h ); */
-
-  /* Sanity check. */
-  if (cj->new.mass == 0.0) {
-    message("%e %e %e %d %p %d %p", cj->new.com[0], cj->new.com[1],
-            cj->new.com[2], cj->count, cj, cj->split, cj->sibling);
-
-    for (j = 0; j < cj->count; ++j)
-      message("part %d mass=%e id=%d", j, cj->parts[j].mass, cj->parts[j].id);
-
-    error("com does not seem to have been set.");
-  }
-
-  /* Corectness check */
-  if (cj->new.mass != cj->legacy.mass)
-    error("Calculation of the CoM is wrong! m_new=%e m_legacy=%e", cj->new.mass,
-          cj->legacy.mass);
-
-  /* Init the com's data. */
-  for (k = 0; k < 3; k++) com[k] = cj->new.com[k];
-  mcom = cj->new.mass;
-
-  /* Loop over every particle in ci. */
-  for (j = 0; j < count; j++) {
-
-    /* Compute the pairwise distance. */
-    for (r2 = 0.0, k = 0; k < 3; k++) {
-      dx[k] = com[k] - parts[j].x[k];
-      r2 += dx[k] * dx[k];
-    }
-
-    /* Apply the gravitational acceleration. */
-    ir = 1.0 / sqrt(r2);
-    w = mcom * const_G * ir * ir * ir;
-    for (k = 0; k < 3; k++) parts[j].a[k] += w * dx[k];
-
-#if ICHECK >= 0
-    if (parts[j].id == ICHECK)
-      printf("[NEW] Can interact with the monopole. x= %f %f %f m= %f h= %f\n",
-             com[0], com[1], com[2], mcom, cj->h);
-#endif
-
-  } /* loop over every particle. */
-}
-
-/**
- * @brief Compute the interactions between all particles in a cell.
- *
- * @param ci The #cell.
- * @param cj The other #cell.
- */
-void iact_pair(struct cell *ci, struct cell *cj) {
-
-  int i, j, k;
-  int count_i = ci->count, count_j = cj->count;
-  double dx[3], xi[3], ai[3], mi, mj, r2, r2_i, r2_j, w, ir;
-  double cih = ci->h, cjh = cj->h;
-  struct part *parts_i = ci->parts, *parts_j = cj->parts;
-  struct cell *cp;
-
-  /* Early abort? */
-  if (count_i == 0 || count_j == 0) return;
-
-  /* Sanity check */
-  if (ci == cj)
-    error("The impossible has happened: pair interaction between a cell and "
-          "itself.");  // debug
-
-  /* Distance between the CoMs */
-  r2 = 0.0;
-  r2_i = 0.0;
-  r2_j = 0.0;
-  for (k = 0; k < 3; k++) {
-    //   dx[k] = fabs( ci->new.com[k] - cj->new.com[k] );
-    dx[k] = fabs(ci->loc[k] - cj->loc[k]);
-
-    r2 += dx[k] * dx[k];
-    r2_i += (dx[k] - 0.5 * cih) * (dx[k] - 0.5 * cih);
-    r2_j += (dx[k] - 0.5 * cjh) * (dx[k] - 0.5 * cjh);
-  }
-
-  /* If ci and cj are sufficiently well separated, split this interaction
-     into a pair of particle-cell interactions. */
-  if ((dist_min * dist_min * r2_j > ci->h * ci->h) &&
-      (dist_min * dist_min * r2_i > cj->h * cj->h)) {
-    iact_pair_pc(ci, cj);
-    iact_pair_pc(cj, ci);
-
-    /* Otherwise, if neither cell is split, compute the particle-particle
-       interactions directly. */
-  } else if (!ci->split && !cj->split) {
-
-    /* Loop over all particles in ci... */
-    for (i = 0; i < count_i; i++) {
-
-      /* Init the ith particle's data. */
-      for (k = 0; k < 3; k++) {
-        xi[k] = parts_i[i].x[k];
-        ai[k] = 0.0;
-      }
-      mi = parts_i[i].mass;
-
-      /* Loop over every following particle. */
-      for (j = 0; j < count_j; j++) {
-
-        /* Compute the pairwise distance. */
-        for (r2 = 0.0, k = 0; k < 3; k++) {
-          dx[k] = xi[k] - parts_j[j].x[k];
-          r2 += dx[k] * dx[k];
-        }
-
-        /* Apply the gravitational acceleration. */
-        ir = 1.0 / sqrt(r2);
-        w = const_G * ir * ir * ir;
-        mj = parts_j[j].mass;
-        for (k = 0; k < 3; k++) {
-          double wdx = w * dx[k];
-          parts_j[j].a[k] += wdx * mi;
-          ai[k] -= wdx * mj;
-        }
-
-#if ICHECK >= 0
-        if (parts_i[i].id == ICHECK)
-          printf("[NEW] Interaction with particle id= %d (pair i)\n",
-                 parts_j[j].id);
-
-        if (parts_j[j].id == ICHECK)
-          printf("[NEW] Interaction with particle id= %d (pair j) h_i= %f h_j= "
-                 "%f ci= %p cj= %p count_i= %d count_j= %d d_i= %d d_j= %d\n",
-                 parts_i[i].id, ci->h, cj->h, ci, cj, count_i, count_j, ci->res,
-                 cj->res);
-#endif
-
-      } /* loop over every other particle. */
-
-      /* Store the accumulated acceleration on the ith part. */
-      for (k = 0; k < 3; k++) parts_i[i].a[k] += ai[k];
-
-    } /* loop over all particles. */
-
-    /* Otherwise, compute the interaction recursively over the progeny. */
-  } else {
-
-    /* We can split one of the two cells. Let's try the biggest one first.*/
-    if (ci->h > cj->h) {
-      if (ci->split) {
-        for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling)
-          iact_pair(cp, cj);
-      } else if (cj->split) {
-        for (cp = cj->firstchild; cp != cj->sibling; cp = cp->sibling)
-          iact_pair(ci, cp);
-      }
-
-    } else {
-      if (cj->split) {
-        for (cp = cj->firstchild; cp != cj->sibling; cp = cp->sibling)
-          iact_pair(ci, cp);
-      } else if (ci->split) {
-        for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling)
-          iact_pair(cp, cj);
-      }
-    }
-  }
-}
-
-/**
- * @brief Compute the interactions between all particles in a cell.
- *
- * @param c The #cell.
- */
-void iact_self(struct cell *c) {
-  int i, j, k, count = c->count;
-  double xi[3], ai[3], mi, mj, dx[3], r2, ir, w;
-  struct part *parts = c->parts;
-  struct cell *cp, *cps;
-
-  /* Early abort? */
-  if (count == 0) return;
-
-  /* message( "cell=[%.3e,%.3e,%.3e], h=%.3e.",
-      c->loc[0], c->loc[1], c->loc[2], c->h ); */
-
-  /* If the cell is split, interact each progeny with itself, and with
-     each of its siblings. */
-  if (c->split) {
-    for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
-      iact_self(cp);
-      for (cps = cp->sibling; cps != c->sibling; cps = cps->sibling)
-        iact_pair(cp, cps);
-    }
-
-    /* Otherwise, compute the interactions directly. */
-  } else {
-
-    /* Loop over all particles... */
-    for (i = 0; i < count; i++) {
-
-      /* Init the ith particle's data. */
-      for (k = 0; k < 3; k++) {
-        xi[k] = parts[i].x[k];
-        ai[k] = 0.0;
-      }
-      mi = parts[i].mass;
-
-      /* Loop over every following particle. */
-      for (j = i + 1; j < count; j++) {
-
-        /* Compute the pairwise distance. */
-        for (r2 = 0.0, k = 0; k < 3; k++) {
-          dx[k] = xi[k] - parts[j].x[k];
-          r2 += dx[k] * dx[k];
-        }
-
-        /* Apply the gravitational acceleration. */
-        ir = 1.0 / sqrt(r2);
-        w = const_G * ir * ir * ir;
-        mj = parts[j].mass;
-        for (k = 0; k < 3; k++) {
-          double wdx = w * dx[k];
-          parts[j].a[k] += wdx * mi;
-          ai[k] -= wdx * mj;
-        }
-
-#if ICHECK >= 0
-        if (parts[i].id == ICHECK)
-          message("[NEW] Interaction with particle id= %d (self i)",
-                  parts[j].id);
-
-        if (parts[j].id == ICHECK)
-          message("[NEW] Interaction with particle id= %d (self j)",
-                  parts[i].id);
-#endif
-
-      } /* loop over every other particle. */
-
-      /* Store the accumulated acceleration on the ith part. */
-      for (k = 0; k < 3; k++) parts[i].a[k] += ai[k];
-
-    } /* loop over all particles. */
-
-  } /* otherwise, compute interactions directly. */
-}
-
-/**
- * @brief Create the tasks for the cell pair/self.
- *
- * @param s The #sched in which to create the tasks.
- * @param ci The first #cell.
- * @param cj The second #cell.
- */
-void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) {
-
-  int k;
-  qsched_task_t tid;
-  struct cell *data[2], *cp, *cps;
-
-  /* If either cell is empty, stop. */
-  if (ci->count == 0 || (cj != NULL && cj->count == 0)) return;
-
-  /* Single cell? */
-  if (cj == NULL) {
-
-    /* Is this cell split? */
-    if (ci->split && ci->count > task_limit) {
-
-      /* Loop over each of this cell's progeny. */
-      for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) {
-
-        /* Make self-interaction task. */
-        create_tasks(s, cp, NULL);
-
-        /* Make all pair-interaction tasks. */
-        for (cps = cp->sibling; cps != ci->sibling; cps = cps->sibling)
-          create_tasks(s, cp, cps);
-      }
-
-      /* Otherwise, add a self-interaction task. */
-    } else {
-
-      /* Set the data. */
-      data[0] = ci;
-      data[1] = NULL;
-
-      /* Create the task. */
-      tid =
-          qsched_addtask(s, task_type_self, task_flag_none, data,
-                         sizeof(struct cell *) * 2, ci->count * ci->count / 2);
-
-      /* Add the resource (i.e. the cell) to the new task. */
-      qsched_addlock(s, tid, ci->res);
-
-      /* If this call might recurse, add a dependency on the cell's COM
-         task. */
-      if (ci->split) qsched_addunlock(s, ci->com_tid, tid);
-    }
-
-    /* Otherwise, it's a pair. */
-  } else {
-
-    double dx, r2 = 0.0, r2_i = 0.0, r2_j = 0.0;
-
-    /* Distance between the cells */
-    for (k = 0; k < 3; k++) {
-      dx = fabs(ci->loc[k] - cj->loc[k]);
-
-      r2 += dx * dx;
-      r2_i += (dx - 0.5 * ci->h) * (dx - 0.5 * ci->h);
-      r2_j += (dx - 0.5 * cj->h) * (dx - 0.5 * cj->h);
-    }
-
-    /* Check whether we can use the multipoles. */
-    if ((dist_min * dist_min * r2_j > ci->h * ci->h) &&
-        (dist_min * dist_min * r2_i > cj->h * cj->h)) {
-      data[0] = ci;
-      data[1] = cj;
-      tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data,
-                           sizeof(struct cell *) * 2, ci->count);
-      qsched_addlock(s, tid, ci->res);
-      qsched_addunlock(s, cj->com_tid, tid);
-
-      data[0] = cj;
-      data[1] = ci;
-      tid = qsched_addtask(s, task_type_pair_pc, task_flag_none, data,
-                           sizeof(struct cell *) * 2, cj->count);
-      qsched_addlock(s, tid, cj->res);
-      qsched_addunlock(s, ci->com_tid, tid);
-
-      /* Otherwise, if neither cells are split, generate a part-part task. */
-    } else if (!ci->split && !cj->split) {
-
-      /* Set the data. */
-      data[0] = ci;
-      data[1] = cj;
-
-      /* Create the task. */
-      tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
-                           sizeof(struct cell *) * 2, ci->count * cj->count);
-
-      /* Add the resources. */
-      qsched_addlock(s, tid, ci->res);
-      qsched_addlock(s, tid, cj->res);
-      qsched_addunlock(s, ci->com_tid, tid);
-      qsched_addunlock(s, cj->com_tid, tid);
-
-      /* Otherwise, compute the interaction recursively over the progeny. */
-    } else if (ci->count > task_limit && cj->count > task_limit) {
-
-      /* We can split one of the two cells. Let's try the biggest one */
-      if (ci->h > cj->h) {
-
-        if (ci->split) {
-          for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling)
-            create_tasks(s, cp, cj);
-        } else if (cj->split) {
-          for (cp = cj->firstchild; cp != cj->sibling; cp = cp->sibling)
-            create_tasks(s, ci, cp);
-        }
-
-      } else {
-
-        if (cj->split) {
-          for (cp = cj->firstchild; cp != cj->sibling; cp = cp->sibling)
-            create_tasks(s, ci, cp);
-        } else if (ci->split) {
-          for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling)
-            create_tasks(s, cp, cj);
-        }
-      }
-
-      /* Create a task if too few particles */
-    } else {
-      /* Set the data. */
-      data[0] = ci;
-      data[1] = cj;
-
-      /* Create the task. */
-      tid = qsched_addtask(s, task_type_pair, task_flag_none, data,
-                           sizeof(struct cell *) * 2, ci->count * cj->count);
-
-      /* Add the resources. */
-      qsched_addlock(s, tid, ci->res);
-      qsched_addlock(s, tid, cj->res);
-
-      /* Depend on the COMs in case this task recurses. */
-      if (ci->split || cj->split) {
-        qsched_addunlock(s, ci->com_tid, tid);
-        qsched_addunlock(s, cj->com_tid, tid);
-      }
-    }
-
-  } /* otherwise, it's a pair. */
-}
-
-/* -------------------------------------------------------------------------- */
-/* Legacy tree walk */
-/* -------------------------------------------------------------------------- */
-
-/**
- * @brief Compute the center of mass of a given cell recursively.
- *
- * @param c The #cell.
- */
-void legacy_comp_com(struct cell *c, int *countCoMs) {
-
-  int k, count = c->count;
-  struct cell *cp;
-  struct part *p, *parts = c->parts;
-  double com[3] = {0.0, 0.0, 0.0}, mass = 0.0;
-
-  ++(*countCoMs);
-
-  /* Is the cell split? */
-  if (c->split) {
-
-    /* Loop over the progeny. */
-    for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
-      /* Recurse */
-      legacy_comp_com(cp, countCoMs);
-
-      /* Collect multipole information */
-      double cp_mass = cp->legacy.mass;
-      com[0] += cp->legacy.com[0] * cp_mass;
-      com[1] += cp->legacy.com[1] * cp_mass;
-      com[2] += cp->legacy.com[2] * cp_mass;
-      mass += cp_mass;
-    }
-
-    /* Otherwise, collect the multipole from local data. */
-  } else {
-
-    for (k = 0; k < count; k++) {
-      p = &parts[k];
-      double p_mass = p->mass;
-      com[0] += p->x[0] * p_mass;
-      com[1] += p->x[1] * p_mass;
-      com[2] += p->x[2] * p_mass;
-      mass += p_mass;
-    }
-  }
-
-  /* Finish multipole calculation */
-  if (mass > 0.0) {
-    double imass = 1.0 / mass;
-    c->legacy.com[0] = com[0] * imass;
-    c->legacy.com[1] = com[1] * imass;
-    c->legacy.com[2] = com[2] * imass;
-    c->legacy.mass = mass;
-  } else {
-    c->legacy.com[0] = 0.0;
-    c->legacy.com[1] = 0.0;
-    c->legacy.com[2] = 0.0;
-    c->legacy.mass = 0.0;
-  }
-
-}
-
-/**
- * @brief Interacts a particle with a cell recursively using the original B-H
- * tree walk procedure
- *
- * @param parts The array of particles
- * @param i The particle of interest
- * @param root The root of the tree under which we will search.
- * @param monitor If set to @c parts[i].id, will produce debug output when
- *        ICHECK is set.
- * @param cell The cell the particle interacts with
- */
-void legacy_interact(struct part *parts, int i, struct cell *root, int monitor,
-                     int *countMultipoles, int *countPairs) {
-
-  int j, k;
-  double r2, dx[3], ir, w;
-  double a[3] = {0.0, 0.0, 0.0};
-  double pix[3] = {parts[i].x[0], parts[i].x[1], parts[i].x[2]};
-  int pid = parts[i].id;
-  struct cell *cell = root;
-
-  /* Traverse the cells of the tree. */
-  while (cell != NULL) {
-
-    /* Are we in a leaf ? */
-    if (!cell->split) {
-
-      /* Interact the particle with the particles in the leaf */
-      for (j = 0; j < cell->count; ++j) {
-        if (cell->parts[j].id == pid) continue;
-
-#if ICHECK >= 0
-        if (pid == monitor)
-          message("[BH_] Interaction with particle id= %d", cell->parts[j].id);
-#endif
-
-        /* Compute the pairwise distance. */
-        for (r2 = 0.0, k = 0; k < 3; k++) {
-          dx[k] = cell->parts[j].x[k] - pix[k];
-          r2 += dx[k] * dx[k];
-        }
-
-        /* Apply the gravitational acceleration. */
-        ir = 1.0 / sqrt(r2);
-        w = cell->parts[j].mass * const_G * ir * ir * ir;
-        for (k = 0; k < 3; k++) a[k] += w * dx[k];
-
-        (*countPairs)++;
-      }
-
-      cell = cell->sibling;
-    } else {
-
-      /* We are in a node */
-      for (r2 = 0.0, k = 0; k < 3; k++) {
-        dx[k] = cell->legacy.com[k] - pix[k];
-        r2 += dx[k] * dx[k];
-      }
-
-#if ICHECK >= 0
-      if (pid == monitor)
-        message("This is a node with %d particles h= %f. r= %f theta= %f",
-                cell->count, cell->h, sqrt(r2), dist_min);
-#endif
-
-      /* Is the cell far enough ? */
-      if (dist_min * dist_min * r2 < cell->h * cell->h) {
-
-#if ICHECK >= 0
-        if (pid == monitor) printf("Recursing...\n");
-#endif
-        cell = cell->firstchild;
-        continue;
-      }
-
-#if ICHECK >= 0
-      if (pid == monitor)
-        message("[BH_] Can interact with the monopole. x= %f %f %f m= %f h= %f",
-                cell->legacy.com[0], cell->legacy.com[1], cell->legacy.com[2],
-                cell->legacy.mass, cell->h);
-#endif
-
-      /* Apply the gravitational acceleration. */
-      ir = 1.0 / sqrt(r2);
-      w = cell->legacy.mass * const_G * ir * ir * ir;
-      for (k = 0; k < 3; k++) a[k] += w * dx[k];
-
-      (*countMultipoles)++;
-
-      /* Move to the next node */
-      cell = cell->sibling;
-    }
-  }
-
-  /* Store the locally computed acceleration back to the particle. */
-  for (k = 0; k < 3; k++) parts[i].a_legacy[k] += a[k];
-}
-
-/**
- * @brief Does a tree walk as in the B-H original work for all particles
- *
- * @param N The number of particles
- * @param parts The array of particles
- * @param root The root cell of the tree
- * @param monitor ID of the particle to monitor and output interactions to
- *        stdout
- */
-void legacy_tree_walk(int N, struct part *parts, struct cell *root, int monitor,
-                      int *countMultipoles, int *countPairs, int *countCoMs) {
-
-  int i;
-
-  /* Compute multipoles (recursively) */
-  legacy_comp_com(root, countCoMs);
-
-  //#pragma omp parallel for
-  for (i = 0; i < N; ++i) {
-    if (parts[i].id == monitor)
-      message("tree walk for particle %d x= %f %f %f", parts[i].id,
-              parts[i].x[0], parts[i].x[1], parts[i].x[2]);
-
-    legacy_interact(parts, i, root, monitor, countMultipoles, countPairs);
-
-    if (parts[i].id == monitor)
-      message("\n[LEGACY] acceleration for particle %d a= %f %f %f",
-              parts[i].id, parts[i].a_legacy[0], parts[i].a_legacy[1],
-              parts[i].a_legacy[2]);
-  }
-}
-
-/* -------------------------------------------------------------------------- */
-/* Exact interaction */
-/* -------------------------------------------------------------------------- */
-
-/**
- * @brief Solve the particle interactions using the stupid N^2 algorithm
- *
- * @param N The number of particles
- * @param parts The array of particles
- */
-void interact_exact(int N, struct part *parts, int monitor) {
-
-  int i, j, k;
-  double ir, w, r2, dx[3];
-
-  /* Loop over all particles. */
-  for (i = 0; i < N; ++i) {
-
-    /* Some things to store locally. */
-    double pix[3] = {parts[i].x[0], parts[i].x[1], parts[i].x[2]};
-    double mi = parts[i].mass;
-
-    /* Loop over every other particle. */
-    for (j = i + 1; j < N; ++j) {
-
-      /* Compute the pairwise distance. */
-      for (r2 = 0.0, k = 0; k < 3; k++) {
-        dx[k] = parts[j].x[k] - pix[k];
-        r2 += dx[k] * dx[k];
-      }
-
-      /* Apply the gravitational acceleration. */
-      ir = 1.0 / sqrt(r2);
-      w = const_G * ir * ir * ir;
-
-      for (k = 0; k < 3; k++) {
-        double wdx = w * dx[k];
-        parts[j].a_exact[k] -= wdx * mi;
-        parts[i].a_exact[k] += wdx * parts[j].mass;
-      }
-    }
-
-  }
-
-  for (i = 0; i < N; ++i)
-    if (parts[i].id == monitor)
-      message("[EXACT ] acceleration for particle %d a= %f %f %f\n",
-              parts[i].id, parts[i].a_exact[0], parts[i].a_exact[1],
-              parts[i].a_exact[2]);
-}
-
-/**
- * @brief Set up and run a task-based Barnes-Hutt N-body solver.
- *
- * @param N The number of random particles to use.
- * @param nr_threads Number of threads to use.
- * @param runs Number of force evaluations to use as a benchmark.
- * @param fileName Input file name. If @c NULL or an empty string, random
- *        particle positions will be used.
- */
-void test_bh(int N, int nr_threads, int runs, char *fileName) {
-
-  int i, k;
-  struct cell *root;
-  struct part *parts;
-  FILE *file;
-  struct qsched s;
-  ticks tic, toc_run, tot_setup = 0, tot_run = 0, tic_exact, toc_exact;
-  int countMultipoles, countPairs, countCoMs;
-
-  /* Runner function. */
-  void runner(int type, void * data) {
-
-    ticks tic = getticks();
-
-    /* Decode the data. */
-    struct cell **d = (struct cell **)data;
-
-    /* Decode and execute the task. */
-    switch (type) {
-      case task_type_self:
-        iact_self(d[0]);
-        break;
-      case task_type_pair:
-        iact_pair(d[0], d[1]);
-        break;
-      case task_type_pair_pc:
-        iact_pair_pc(d[0], d[1]);
-        break;
-      case task_type_com:
-        comp_com(d[0]);
-        break;
-      default:
-        error("Unknown task type.");
-    }
-
-    atomic_add(&task_timers[type], getticks() - tic);
-  }
-
-  /* Initialize the per-task type timers. */
-  for (k = 0; k < task_type_count; k++) task_timers[k] = 0;
-
-  /* Initialize the scheduler. */
-  qsched_init(&s, nr_threads, qsched_flag_noreown);
-
-  /* Init and fill the particle array. */
-  if ((parts = (struct part *)malloc(sizeof(struct part) * N)) == NULL)
-    error("Failed to allocate particle buffer.");
-
-  /* If no input file was specified, generate random particle positions. */
-  if (fileName == NULL || fileName[0] == 0) {
-    for (k = 0; k < N; k++) {
-      parts[k].id = k;
-      parts[k].x[0] = ((double)rand()) / RAND_MAX;
-      parts[k].x[1] = ((double)rand()) / RAND_MAX;
-      parts[k].x[2] = ((double)rand()) / RAND_MAX;
-      parts[k].mass = ((double)rand()) / RAND_MAX;
-      parts[k].a_legacy[0] = 0.0;
-      parts[k].a_legacy[1] = 0.0;
-      parts[k].a_legacy[2] = 0.0;
-    }
-
-    /* Otherwise, read them from a file. */
-  } else {
-    file = fopen(fileName, "r");
-    if (file) {
-      for (k = 0; k < N; k++) {
-        if (fscanf(file, "%d", &parts[k].id) != 1)
-          error("Failed to read ID of part %i.", k);
-        if (fscanf(file, "%lf%lf%lf", &parts[k].x[0], &parts[k].x[1],
-                   &parts[k].x[2]) !=
-            3)
-          error("Failed to read position of part %i.", k);
-        if (fscanf(file, "%lf", &parts[k].mass) != 1)
-          error("Failed to read mass of part %i.", k);
-      }
-      fclose(file);
-    }
-  }
-
-  /* Init the cells. */
-  root = cell_get();
-  root->loc[0] = 0.0;
-  root->loc[1] = 0.0;
-  root->loc[2] = 0.0;
-  root->h = 1.0;
-  root->count = N;
-  root->parts = parts;
-  cell_split(root, &s);
-
-  printf("----------------------------------------------------------\n");
-
-  /* Do a N^2 interactions calculation */
-
-  tic_exact = getticks();
-  //interact_exact( N , parts , ICHECK );
-  toc_exact = getticks();
-
-  printf("Exact calculation (1 thread) took %lli (= %e) ticks\n",
-         toc_exact - tic_exact, (float)(toc_exact - tic_exact));
-
-  printf("----------------------------------------------------------\n");
-
-  /* Create the tasks. */
-  tic = getticks();
-  create_tasks(&s, root, NULL);
-  tot_setup += getticks() - tic;
-
-  /* Dump the number of tasks. */
-  message("total nr of tasks: %i.", s.count);
-  message("total nr of deps: %i.", s.count_deps);
-  message("total nr of res: %i.", s.count_res);
-  message("total nr of locks: %i.", s.count_locks);
-  message("total nr of uses: %i.", s.count_uses);
-  int counts[task_type_count];
-  for (k = 0; k < task_type_count; k++) counts[k] = 0;
-  for (k = 0; k < s.count; k++) counts[s.tasks[k].type] += 1;
-
-  char buffer[200];
-  sprintf(buffer, "timings_legacy_%d_%d.dat", cell_maxparts, nr_threads);
-  FILE *fileTime = fopen(buffer, "w");
-
-  /* Loop over the number of runs. */
-  for (k = 0; k < runs; k++) {
-
-    countMultipoles = 0;
-    countPairs = 0;
-    countCoMs = 0;
-
-    /* Execute the legacy walk. */
-    tic = getticks();
-    legacy_tree_walk(N, parts, root, ICHECK, &countMultipoles, &countPairs,
-                     &countCoMs);
-    toc_run = getticks();
-
-    /* Dump some timings. */
-    message("%ith run took %lli (= %e) ticks...", k, toc_run - tic,
-            (float)(toc_run - tic));
-    tot_run += toc_run - tic;
-    fprintf(fileTime, "%lli %e\n", toc_run - tic, (float)(toc_run - tic));
-  }
-
-  fclose(fileTime);
-
-  printf("task counts: [ %8s %8s %8s %8s ]\n", "self", "direct", "m-poles",
-         "CoMs");
-  printf("task counts: [ %8i %8i %8i %8i ] (legacy).\n", 0, countPairs,
-         countMultipoles, countCoMs);
-  printf("task counts: [ ");
-  for (k = 0; k < task_type_count; k++) printf("%8i ", counts[k]);
-  printf("] (new).\n");
-
-  /* Loop over the number of runs. */
-  for (k = 0; k < runs; k++) {
-
-    for (i = 0; i < N; ++i) {
-      parts[i].a[0] = 0.;
-      parts[i].a[1] = 0.;
-      parts[i].a[2] = 0.;
-    }
-
-    /* Execute the tasks. */
-    tic = getticks();
-    qsched_run(&s, nr_threads, runner);
-    toc_run = getticks();
-    message("%ith run took %lli (= %e) ticks...", k, toc_run - tic,
-            (float)(toc_run - tic));
-    tot_run += toc_run - tic;
-  }
-
-  message("[check] root mass= %f %f", root->legacy.mass, root->new.mass);
-  message("[check] root CoMx= %f %f", root->legacy.com[0], root->new.com[0]);
-  message("[check] root CoMy= %f %f", root->legacy.com[1], root->new.com[1]);
-  message("[check] root CoMz= %f %f", root->legacy.com[2], root->new.com[2]);
-
-  /* Dump the tasks. */
-  /* for ( k = 0 ; k < s.count ; k++ ) */
-  /*     printf( " %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid ,
-   * s.tasks[k].tic , s.tasks[k].toc ); */
-
-  /* Dump the costs. */
-  message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup,
-          tot_run / runs);
-
-  /* Dump the timers. */
-  for (k = 0; k < qsched_timer_count; k++)
-    message("timer %s is %lli ticks.", qsched_timer_names[k],
-            s.timers[k] / runs);
-
-  /* Dump the per-task type timers. */
-  printf("task timers: [ ");
-  for (k = 0; k < task_type_count; k++) printf("%lli ", task_timers[k] / runs);
-  printf("] ticks.\n");
-
-  /* Dump the particles to a file */
-  /*file = fopen("particle_dump.dat", "w");
-  fprintf(file, "# x y z a_exact.x   a_exact.y    a_exact.z    a_legacy.x    "
-                "a_legacy.y    a_legacy.z    a_new.x     a_new.y    a_new.z\n");
-  for (k = 0; k < N; ++k)
-    fprintf(file, "%d %e %e %e %e %e %e %e %e %e %e %e %e\n", parts[k].id,
-            parts[k].x[0], parts[k].x[1], parts[k].x[2], parts[k].a_exact[0],
-            parts[k].a_exact[1], parts[k].a_exact[2], parts[k].a_legacy[0],
-            parts[k].a_legacy[1], parts[k].a_legacy[2], parts[k].a[0],
-            parts[k].a[1], parts[k].a[2]);
-  fclose(file);*/
-
-  /* Clean up. */
-  qsched_free(&s);
-}
-
-/**
- * @brief Main function.
- */
-
-int main(int argc, char *argv[]) {
-
-  int c, nr_threads;
-  int N = 1000, runs = 1;
-  char fileName[100] = {0};
-
-/* Get the number of threads. */
-#pragma omp parallel shared(nr_threads)
-  {
-    if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
-  }
-
-  /* Parse the options */
-  while ((c = getopt(argc, argv, "n:r:t:f:")) != -1) switch (c) {
-      case 'n':
-        if (sscanf(optarg, "%d", &N) != 1)
-          error("Error parsing number of particles.");
-        break;
-      case 'r':
-        if (sscanf(optarg, "%d", &runs) != 1)
-          error("Error parsing number of runs.");
-        break;
-      case 't':
-        if (sscanf(optarg, "%d", &nr_threads) != 1)
-          error("Error parsing number of threads.");
-        omp_set_num_threads(nr_threads);
-        break;
-      case 'f':
-        if (sscanf(optarg, "%s", &fileName[0]) != 1)
-          error("Error parsing file name.");
-        break;
-      case '?':
-        fprintf(stderr,
-                "Usage: %s [-t nr_threads] [-n N] [-r runs] [-f file]\n",
-                argv[0]);
-        fprintf(stderr, "Solves the N-body problem using a Barnes-Hutt\n"
-                        "tree code with N random particles read from a file in "
-                        "[0,1]^3 using\n"
-                        "nr_threads threads.\n");
-        exit(EXIT_FAILURE);
-    }
-
-  /* Tree node information */
-  printf("Size of cell: %zu bytes.\n", sizeof(struct cell));
-
-  /* Dump arguments. */
-  if (fileName[0] == 0) {
-    message("Computing the N-body problem over %i random particles using %i "
-            "threads (%i runs).",
-            N, nr_threads, runs);
-  } else {
-    message("Computing the N-body problem over %i particles read from '%s' "
-            "using %i threads (%i runs).",
-            N, fileName, nr_threads, runs);
-  }
-
-  /* Run the test. */
-  test_bh(N, nr_threads, runs, fileName);
-
-  return 0;
-}
diff --git a/examples/test_bh_sorted.c b/examples/test_bh_sorted.c
index fbdf55acb1a11b844df24be92356952e6ba8192b..1e596813280a5c7265bc9eb3e0f65730d2b29357 100644
--- a/examples/test_bh_sorted.c
+++ b/examples/test_bh_sorted.c
@@ -42,7 +42,7 @@
 #define const_G 1    // 6.6738e-8
 #define dist_min 0.5 /* Used for legacy walk only */
 #define dist_cutoff_ratio 1.5
-#define iact_pair_direct iact_pair_direct_sorted_dipole
+#define iact_pair_direct iact_pair_direct_sorted
 
 #define ICHECK -1
 #define NO_SANITY_CHECKS
@@ -116,7 +116,7 @@ struct dipole {
   float axis[3];
   double x[3];
   float mass;
-  double da, da2;
+  float da, da2;
 };
 
 static inline void dipole_init(struct dipole *d, const float *axis) {
@@ -164,7 +164,7 @@ static inline void dipole_iact(const struct dipole *d, const double *x,
 
   /* Compute the first dipole. */
   for (r2 = 0.0f, k = 0; k < 3; k++) {
-    dx[k] = x[k] - d->x[k] * inv_mass + offset * d->axis[k];
+    dx[k] = x[k] - (d->x[k] * inv_mass + offset * d->axis[k]);
     r2 += dx[k] * dx[k];
   }
   ir = 1.0f / sqrtf(r2);
@@ -173,7 +173,7 @@ static inline void dipole_iact(const struct dipole *d, const double *x,
 
   /* Compute the second dipole. */
   for (r2 = 0.0f, k = 0; k < 3; k++) {
-    dx[k] = x[k] - d->x[k] * inv_mass - offset * d->axis[k];
+    dx[k] = x[k] - (d->x[k] * inv_mass - offset * d->axis[k]);
     r2 += dx[k] * dx[k];
   }
   ir = 1.0f / sqrtf(r2);
@@ -377,7 +377,7 @@ void indices_sort(struct index *sort, int N) {
  * @param axis The normalized axis along which to sort.
  * @param aid The axis ID at which to store the indices.
  */
-void cell_sort(struct cell *c, float *axis, int aid) {
+void cell_sort(struct cell *c, const float *axis, int aid) {
 
   /* Has the indices array even been allocated? */
   if (c->indices == NULL) {
@@ -1609,6 +1609,16 @@ static inline void iact_pair_direct_sorted_dipole(struct cell *ci,
     for (k = 0; k < 3; k++) parts_i[i].a[k] += ai[k];
 
   } /* loop over all particles in ci. */
+  
+  /* Dump extensive info on the dipole. */
+  /* message("dip[0] has x=[%e,%e,%e], axis=[%e,%e,%e], da=%e, da2=%e, mass=%e.",
+    dip[0].x[0], dip[0].x[1], dip[0].x[2],
+    dip[0].axis[0], dip[0].axis[1], dip[0].axis[2],
+    dip[0].da, dip[0].da2, dip[0].mass);
+  for (j = count_j; j < cj->count; j++) {
+    message("   particle x=[%e,%e,%e], mass=%e.",
+      parts_j[j].x[0], parts_j[j].x[1], parts_j[j].x[2], parts_j[j].mass);
+  } */
 
   /* Re-init some values. */
   count_j = cj->count;
diff --git a/examples/test_qr_ompss.c b/examples/test_qr_ompss.c
index 8d9286775f01d4ac3b8514f6a657c20f2eede68f..475e892376f6412dd5a71c2de5c10a49793db6c9 100644
--- a/examples/test_qr_ompss.c
+++ b/examples/test_qr_ompss.c
@@ -417,6 +417,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs) {
   int k, j, i, r;
   double* A, *A_orig, *tau;
   ticks tic, toc_run, tot_run = 0;
+  double tid[ m * n ];
 
   /* Allocate and fill the original matrix. */
   if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
@@ -539,7 +540,7 @@ int main(int argc, char* argv[]) {
   /* Dump arguments. */
   message("Computing the tiled QR decomposition of a %ix%i matrix using %i "
           "threads (%i runs).",
-          32 * M, 32 * N, nr_threads, runs);
+          K * M, K * N, nr_threads, runs);
 
   /* Initialize the timers. */
   if ((timers = (struct timer*)malloc(sizeof(struct timer) * M * M * N)) ==