diff --git a/Makefile.am b/Makefile.am
index 09288605fdb7ff0e34ddac06a4d0e0177bfb725b..507353ba2432238cafaacd7c0ec7f12a13e374a5 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -19,4 +19,4 @@
 ACLOCAL_AMFLAGS = -I m4
 
 # Show the way...
-SUBDIRS = src examples doc
+SUBDIRS = src examples doc tests
diff --git a/configure.ac b/configure.ac
index c5f61aa17fd34adee772b1b3c17035df6fc63dad..96e0cab2ad4fc57ddcd12117f9733558fbf6de42 100644
--- a/configure.ac
+++ b/configure.ac
@@ -97,6 +97,31 @@ if test "$enable_mpi" = "yes"; then
 fi
 AM_CONDITIONAL([HAVEMPI],[test -n "$MPICC"])
 
+# Check for metis. Note AX_LIB_METIS exists, but cannot be configured
+# to be default off (i.e. given no option it tries to locate METIS), so we
+# don't use that.
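+# Typical usage (the path form assumes METIS was installed into a custom
+# library directory): "./configure --with-metis" to link -lmetis from the
+# default search path, or "./configure --with-metis=/opt/metis/lib" to point
+# at a specific directory. Without the option, METIS support stays disabled.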
+AC_ARG_WITH([metis],
+    [AS_HELP_STRING([--with-metis=PATH],
+       [directory where the metis library is installed @<:@default=no@:>@]
+    )],
+    [],
+    [with_metis="no"]
+)
+if test "x$with_metis" != "xno"; then
+   if test "x$with_metis" != "xyes" -a "x$with_metis" != "x"; then
+      METIS_LIBS="-L$with_metis -lmetis"
+   else
+      METIS_LIBS="-lmetis"
+   fi
+   AC_CHECK_LIB([metis],[METIS_PartGraphKway],
+      AC_DEFINE([HAVE_METIS],1,[The metis library appears to be present.]),
+      AC_MSG_ERROR(something is wrong with the metis library!),$METIS_LIBS)
+fi
+AC_SUBST([METIS_LIBS])
+AM_CONDITIONAL([HAVEMETIS],[test -n "$METIS_LIBS"])
+
 # autoconf stuff
 AC_PROG_INSTALL
 AC_PROG_MAKE_SET
@@ -138,7 +163,7 @@ AC_MSG_RESULT($rtc_ok)
 DX_INIT_DOXYGEN(libquicksched,doc/Doxyfile,doc/)
 
 # .in files.
-AC_CONFIG_FILES([Makefile src/Makefile examples/Makefile doc/Makefile doc/Doxyfile])
+AC_CONFIG_FILES([Makefile src/Makefile examples/Makefile doc/Makefile doc/Doxyfile tests/Makefile])
 
 # generate output.
 AC_OUTPUT
diff --git a/examples/Makefile.am b/examples/Makefile.am
index 5d9d3f87b8c131acac2c9e3d28e82893a7ff4efc..52663d20172ceede87305c7df352f377f5fe2839 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -20,7 +20,7 @@
 AM_CFLAGS = -g -O3 -Wall -Werror -I../src -ffast-math -fstrict-aliasing \
     -ftree-vectorize -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
     -DCPU_TPS=2.67e9 -DTIMERS -std=gnu99 \
-    # -fsanitize=address -fno-omit-frame-pointer
+     -fsanitize=address -fno-omit-frame-pointer
 # AM_CFLAGS = -g -O0 -Wall -Werror -I../src \
 #     -DCPU_TPS=2.67e9 -DTIMERS $(OPENMP_CFLAGS) \
 #     -fsanitize=address -fno-omit-frame-pointer
diff --git a/src/Makefile.am b/src/Makefile.am
index 6573232798c78acacb7a726b20de1398103653a3..a2accbdbbc3be718767cf336ca98e8339147e724 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -18,7 +18,7 @@
 # Add the debug flag to the whole thing
 AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
     -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) -DTIMERS -std=gnu99 \
-    # -fsanitize=address -fno-omit-frame-pointer
+     -fsanitize=address -fno-omit-frame-pointer
 
 # Assign a "safe" version number
 AM_LDFLAGS = -version-info 0:0:0
@@ -26,10 +26,15 @@ AM_LDFLAGS = -version-info 0:0:0
 # Build the libquicksched library
 lib_LTLIBRARIES = libquicksched.la
 
+METIS_LIBS = @METIS_LIBS@
+MPI_THREAD_LIBS = @MPI_THREAD_LIBS@
+MPI_LIBS = $(METIS_LIBS) $(MPI_THREAD_LIBS)
+
 if HAVEMPI
 lib_LTLIBRARIES += libquickschedMPI.la
 endif
 
+libquicksched_la_LDFLAGS = $(METIS_LIBS)
 libquicksched_la_SOURCES = qsched.c queue.c
 
 #libquickschedMPI_la_CC = mpicc
diff --git a/src/qsched.c b/src/qsched.c
index eb7e2ff6393399913489c64d36796737b82e4cfb..10c0d64a099784e1a04e591697bdc4d83088488e 100644
--- a/src/qsched.c
+++ b/src/qsched.c
@@ -25,6 +25,7 @@
 #include <stdlib.h>
 #include <string.h>
 
+
 /* OpenMP headers, only if available. */
 #ifdef HAVE_OPENMP
     #include <omp.h>
@@ -49,8 +50,14 @@
 #include "qsched.h"
 #include "queue.h"
 
+#ifdef HAVE_METIS
+#ifdef WITH_MPI
+#include <metis.h>
+#endif
+#endif
+
 #ifdef WITH_MPI
-inline int getindex(long long int    id , struct qsched *s)
+inline int getindex(long long int id , struct qsched *s)
 {
     return s->res_ranks[id>>48] + (id & 0xFFFFFFFFFFFFFF);
 }
@@ -313,6 +320,756 @@ void qsched_reset ( struct qsched *s ) {
     
     }
 
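+/**
+ * @brief Gather the task, dependency, lock, use, locker and user lists of all
+ *        MPI ranks so that every rank holds the complete scheduler state.
+ *
+ * The same pattern is used for every list: the per-rank counts are turned into
+ * offsets with a prefix sum, each rank copies its local entries into its own
+ * slice of a zero-initialised global array, and an MPI_Allreduce with MPI_SUM
+ * then acts as an all-gather, since every slot is non-zero on exactly one
+ * rank. Note that the buffers are reduced as arrays of MPI_INT, which assumes
+ * the element sizes are multiples of sizeof(int).
+ */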
+void qsched_sync_schedulers( struct qsched *s){
+#ifndef WITH_MPI
+    error("Quicksched wasn't compiled with MPI support");
+#else
+
+    int i;//, j, k;
+
+    /* Synchronize the tasks. */
+    s->task_ranks[s->rank+1] = s->count;
+    MPI_Allreduce(MPI_IN_PLACE, s->task_ranks, s->count_ranks+1, MPI_INT, MPI_SUM, s->comm);
+    s->task_ranks[0] = 0;
+
+    //s->task_ranks[count_ranks] should be the total number of tasks.
+    for(i = 1; i < s->count_ranks+1; i++){
+        s->task_ranks[i] += s->task_ranks[i-1];
+    }
+
+    struct task *tasks_new = (struct task*) calloc(s->task_ranks[s->count_ranks] , sizeof(struct task));
+    struct task *tasks_local = &tasks_new[s->task_ranks[s->rank]];
+    for(i = 0; i < s->count; i++)
+    {
+        tasks_local[i] = s->tasks[i];
+    }
+    int number = sizeof(struct task) * s->task_ranks[s->count_ranks];
+    number = number / sizeof(int);
+    MPI_Allreduce(MPI_IN_PLACE, tasks_new, number, MPI_INT, MPI_SUM, s->comm);
+    
+    free(s->tasks);
+    s->tasks = tasks_new;
+
+    int *temp;
+    temp = (int*) calloc( s->count_ranks+1 , sizeof(int) );
+    temp[s->rank+1] = s->count_locks;
+    MPI_Allreduce(MPI_IN_PLACE, temp, s->count_ranks+1, MPI_INT, MPI_SUM, s->comm);
+    
+    for(i = 1; i < s->count_ranks+1; i++){
+        temp[i] += temp[i-1];
+    }
+
+
+    /* Synchronize the locks. */
+    long long int *locks_new = (long long int*) calloc(temp[s->count_ranks], sizeof(long long int));
+    long long int *locks_key_new = (long long int*) calloc(temp[s->count_ranks], sizeof(long long int));
+    long long int *locks_local = &locks_new[temp[s->rank]];
+    long long int *locks_key_local = &locks_key_new[temp[s->rank]];
+    for(i = 0; i < s->count_locks; i++)
+    {
+        locks_local[i] = s->locks[i];
+        locks_key_local[i] = s->locks_key[i];
+    }
+    int size = sizeof(long long int) * temp[s->count_ranks];
+    size = size / sizeof(int);
+
+    MPI_Allreduce(MPI_IN_PLACE, locks_new, size, MPI_INT, MPI_SUM, s->comm);
+    MPI_Allreduce(MPI_IN_PLACE, locks_key_new, size, MPI_INT, MPI_SUM, s->comm);
+
+    free(s->locks);
+    free(s->locks_key);
+    s->locks = locks_new;
+    s->locks_key = locks_key_new; 
+    s->count_locks = temp[s->count_ranks];
+
+
+    /* Synchronize the deps. */
+    memset(temp, 0, sizeof(int) * (s->count_ranks+1));
+    temp[s->rank+1] = s->count_deps;
+    MPI_Allreduce(MPI_IN_PLACE, temp, s->count_ranks+1, MPI_INT, MPI_SUM, s->comm);
+    
+    for(i = 1; i < s->count_ranks+1; i++){
+        temp[i] += temp[i-1];
+    }
+
+    long long int *deps_new = (long long int*) calloc(temp[s->count_ranks], sizeof(long long int));
+    long long int *deps_key_new = (long long int*) calloc(temp[s->count_ranks], sizeof(long long int));
+    long long int *deps_local = &deps_new[temp[s->rank]];
+    long long int *deps_key_local = &deps_key_new[temp[s->rank]];
+    for(i = 0; i < s->count_deps; i++)
+    {
+        deps_local[i] = s->deps[i];
+        deps_key_local[i] = s->deps_key[i];
+    }
+
+    size = sizeof(long long int) * temp[s->count_ranks];
+    size = size / sizeof(int);
+
+    MPI_Allreduce(MPI_IN_PLACE, deps_new, size, MPI_INT, MPI_SUM, s->comm);
+    MPI_Allreduce(MPI_IN_PLACE, deps_key_new, size, MPI_INT, MPI_SUM, s->comm);
+
+    free(s->deps);
+    free(s->deps_key);
+    s->deps = deps_new;
+    s->deps_key = deps_key_new;
+    s->count_deps = temp[s->count_ranks];
+
+
+    /* Synchronize the uses. */
+    memset(temp, 0, sizeof(int) * (s->count_ranks+1));
+    temp[s->rank+1] = s->count_uses;
+    MPI_Allreduce(MPI_IN_PLACE, temp, s->count_ranks+1, MPI_INT, MPI_SUM, s->comm);
+
+    for(i = 1; i < s->count_ranks+1; i++){
+        temp[i] += temp[i-1];
+    }
+
+    long long int *uses_new = (long long int*) calloc(temp[s->count_ranks], sizeof(long long int));
+    long long int *uses_key_new = (long long int*) calloc(temp[s->count_ranks], sizeof(long long int));
+    long long int *uses_local = &uses_new[temp[s->rank]];
+    long long int *uses_key_local = &uses_key_new[temp[s->rank]];
+
+    for(i = 0; i < s->count_uses; i++)
+    {
+        uses_local[i] = s->uses[i];
+        uses_key_local[i] = s->uses_key[i];
+    }
+    size = sizeof(long long int) * temp[s->count_ranks];
+    size = size / sizeof(int);
+    MPI_Allreduce(MPI_IN_PLACE, uses_new, size, MPI_INT, MPI_SUM, s->comm);
+    MPI_Allreduce(MPI_IN_PLACE, uses_key_new, size, MPI_INT, MPI_SUM, s->comm);
+    
+    free(s->uses);
+    free(s->uses_key);
+    s->uses = uses_new;
+    s->uses_key = uses_key_new;
+    s->count_uses = temp[s->count_ranks];
+
+    /* Synchronize the lockers. */
+    memset(temp, 0, sizeof(int) * (s->count_ranks+1));
+    temp[s->rank+1] = s->count_lockers;
+    MPI_Allreduce(MPI_IN_PLACE, temp, s->count_ranks+1, MPI_INT, MPI_SUM, s->comm);
+    for(i = 1; i < s->count_ranks+1; i++){
+        temp[i] += temp[i-1];
+    }
+
+    long long int *lockers_new = (long long int*) calloc(temp[s->count_ranks], sizeof(long long int));
+    long long int *lockers_key_new = (long long int*) calloc(temp[s->count_ranks], sizeof(long long int));
+    long long int *lockers_local = &lockers_new[temp[s->rank]];
+    long long int *lockers_key_local = &lockers_key_new[temp[s->rank]];
+
+    for(i = 0; i < s->count_lockers; i++)
+    {
+        lockers_local[i] = s->lockers[i];
+        lockers_key_local[i] = s->lockers_key[i];
+    }
+    size = sizeof(long long int) * temp[s->count_ranks];
+    size = size / sizeof(int);
+    MPI_Allreduce(MPI_IN_PLACE, lockers_new, size, MPI_INT, MPI_SUM, s->comm);
+    MPI_Allreduce(MPI_IN_PLACE, lockers_key_new, size, MPI_INT, MPI_SUM, s->comm);
+    
+    free(s->lockers);
+    free(s->lockers_key);
+    s->lockers = lockers_new;
+    s->lockers_key = lockers_key_new;
+    s->count_lockers = temp[s->count_ranks];
+
+    /* Synchronize the users. */
+    memset(temp, 0, sizeof(int) * (s->count_ranks+1));
+    temp[s->rank+1] = s->count_users;
+    MPI_Allreduce(MPI_IN_PLACE, temp, s->count_ranks+1, MPI_INT, MPI_SUM, s->comm);
+
+    for(i = 1; i < s->count_ranks+1; i++){
+        temp[i] += temp[i-1];
+    }
+
+    long long int *users_new = (long long int*) calloc(temp[s->count_ranks], sizeof(long long int));
+    long long int *users_key_new = (long long int*) calloc(temp[s->count_ranks], sizeof(long long int));
+    long long int *users_local = &users_new[temp[s->rank]];
+    long long int *users_key_local = &users_key_new[temp[s->rank]];
+
+    for(i = 0; i < s->count_users; i++)
+    {
+        users_local[i] = s->users[i];
+        users_key_local[i] = s->users_key[i];
+    }
+    size = sizeof(long long int) * temp[s->count_ranks];
+    size = size / sizeof(int);
+    MPI_Allreduce(MPI_IN_PLACE, users_new, size, MPI_INT, MPI_SUM, s->comm);
+    MPI_Allreduce(MPI_IN_PLACE, users_key_new, size, MPI_INT, MPI_SUM, s->comm);
+    
+    free(s->users);
+    free(s->users_key);
+    s->users = users_new;
+    s->users_key = users_key_new;
+    s->count_users = temp[s->count_ranks];
+
+    free(temp);
+    //Synchronize the num_users and num_lockers.
+    temp = (int*) calloc( s->res_ranks[s->count_ranks], sizeof(int));
+
+    for(i = 0; i < s->res_ranks[s->count_ranks]; i++)
+    {
+        temp[i] = s->res[i].num_users;
+    }
+    MPI_Allreduce(MPI_IN_PLACE, temp, s->res_ranks[s->count_ranks], MPI_INT, MPI_SUM, s->comm);
+    for(i = 0; i < s->res_ranks[s->count_ranks]; i++)
+    {
+        s->res[i].num_users = temp[i];
+    }
+
+
+    for(i = 0; i < s->res_ranks[s->count_ranks]; i++)
+    {
+        temp[i] = s->res[i].num_lockers;
+    }
+
+    MPI_Allreduce(MPI_IN_PLACE, temp, s->res_ranks[s->count_ranks], MPI_INT, MPI_SUM, s->comm);
+    for(i = 0; i < s->res_ranks[s->count_ranks]; i++)
+    {
+        s->res[i].num_lockers = temp[i];
+    }
+
+    free(temp);
+#endif
+
+}
+
+
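+/**
+ * @brief Partition the resources (and with them the tasks) over the MPI ranks
+ *        using METIS.
+ *
+ * Builds a weighted graph whose vertices are the highest-level resources that
+ * are locked or used (resources with a locked/used ancestor collapse into that
+ * ancestor), with vertex weights taken from the task costs in res_costs and
+ * edge weights accumulating the cost of every task that touches both
+ * endpoints. The packed triangular edge matrix is converted to CSR form and
+ * handed to METIS_PartGraphKway to cut the graph into count_ranks parts. The
+ * resulting nodeIDs array is not used for anything yet; this is still work in
+ * progress.
+ */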
+void qsched_partition( struct qsched *s){
+#if defined(WITH_MPI) && defined(HAVE_METIS)
+    int *res_costs;
+    int i, j, k, l;
+    struct task *t;
+    struct res *r;
+
+    res_costs = (int*) calloc(s->res_ranks[s->count_ranks], sizeof(int));
+
+    //Loop through the local tasks and compute the cost of the tasks associated with each resource.
+    for(i = s->task_ranks[s->rank]; i < s->task_ranks[s->rank+1]; i++)
+    {
+        t = &s->tasks[i];
+        printf("i = %i\n", i);
+        printf("t->nr_locks = %i\n", t->nr_locks);
+        printf("t->locks = %p\n", t->locks);
+        for(j = 0; j < t->nr_locks; j++)
+        {
+            printf("getindex(t->locks[%i], s)\n", j);
+            res_costs[getindex(t->locks[j],s)] = t->cost;
+        }
+
+        for(j = 0; j < t->nr_uses; j++)
+        {
+            res_costs[getindex(t->uses[j],s)] = t->cost;
+        }
+    }   
+
+    //All reduce res_costs;
+    MPI_Allreduce(MPI_IN_PLACE, res_costs, s->res_ranks[s->count_ranks], MPI_INT, MPI_SUM, s->comm);
+
+    //Build a nodelist of the highest level hierarchical resources which are locked or used.
+    idx_t *nodelist;
+    idx_t node_count=0;
+    idx_t *noderef;
+    nodelist = (idx_t *) calloc( s->res_ranks[s->count_ranks], sizeof(idx_t) );
+    noderef = (idx_t *) calloc(s->res_ranks[s->count_ranks], sizeof(idx_t) );
+
+    int counter =0;
+    for(i = 0; i <s->res_ranks[s->count_ranks]; i++)
+    {
+        if(s->res[i].num_lockers > 0)
+        {
+            counter++;
+        }
+    }
+    printf("num_locked = %i\n", counter);
+
+ //Loop through the resources.
+    for(i = 0; i < s->res_ranks[s->count_ranks]; i++)
+    {
+        //If the resource is locked or used.
+        if(s->res[i].num_lockers > 0 || s->res[i].num_users > 0){
+            
+            r = &s->res[i];
+            //If it has a parent.
+            if(r->parent != -1)
+            {
+                //If the parent is locked then we don't need this.
+                if(s->res[getindex(r->parent,s)].num_lockers > 0 || s->res[getindex(r->parent,s)].num_users > 0)
+                {
+                    noderef[node_count] = -1;
+                    continue;
+                }else{
+                    //Else, recurse up, if any parents are locked then we don't need it.
+                    noderef[node_count] = -10;
+                    while(r->parent != -1)
+                    {
+                        r = getres(r->parent, s);
+                        if(s->res[getindex(r->parent,s)].num_lockers > 0 || s->res[getindex(r->parent,s)].num_users > 0)
+                        {
+                            noderef[node_count] = 0;
+                            break;
+                        }
+                    }
+                    //If none of the parents are locked, then add this to the nodelist.
+                    if(noderef[node_count] == -10)
+                    {   
+                        noderef[node_count] = i;
+                        nodelist[node_count++] = res_costs[i];
+                    }
+                }
+            }else{
+                //Else set noderef to the index i, and set the cost in the nodelist to the res_cost value.
+                noderef[node_count] = i;
+                nodelist[node_count++] = res_costs[i];
+            }
+
+        }else{
+            //Else do nothing.
+            noderef[node_count] = -1;
+        }
+
+    }
+
+    printf("node_count = %i\n", node_count);
+
+//Build an edgelist where each edge weight accumulates task->cost for every task that locks
+//(and eventually uses) both resources. If a resource is not in noderef, walk up its parents
+//until we reach the one that is (yuck). Build an initial "complete" packed triangular matrix
+//first, then drop the empty edges.
+
+    idx_t *edgelist;
+    edgelist = (idx_t *) calloc( (node_count * (node_count+1))/2, sizeof(idx_t));
+
+    /* Loop through local tasks */
+    for(i = s->task_ranks[s->rank]; i < s->task_ranks[s->rank+1]; i++)
+    {
+        t = &s->tasks[i];
+//        printf("t->nr_locks = %i, t = %i\n", t->nr_locks, i);
+        for(j = 0; j < t->nr_locks; j++)
+        {
+            r = getres(t->locks[j], s);
+            struct res *r2 = r;
+            while(r2->parent != -1)
+            {
+                r2 = getres(r2->parent, s);
+                if(r2->num_lockers > 0)
+                    r = r2;
+            }
+            for(k = 0; k < t->nr_locks; k++)
+            {
+                if(j == k)
+                    continue;
+
+                r2 = getres(t->locks[k], s);
+                struct res *r3 = r2;
+                while(r3->parent != -1)
+                {
+                    r3 = getres(r3->parent, s);
+                    if(r3->num_lockers > 0)
+                        r2 = r3;
+                }
+
+                if(r == r2)
+                {
+                    continue;
+                }
+                int index1 = r - s->res;
+                int index2 = r2 - s->res;
+
+                int nodepos1 = -1, nodepos2 = -1;
+
+                for(l = 0; l < node_count; l++)
+                {
+                    if(noderef[l] == index1)
+                        nodepos1 = l;
+    
+                    if(noderef[l] == index2)
+                        nodepos2 = l;
+
+                }
+                if(nodepos1 < 0 || nodepos2 < 0){
+                        error("ERROR NODEPOS NOT VALID\n");
+                }
+                if(nodepos1 < nodepos2)
+                {
+//[nodepos1][nodepos2] maps to (node_count * nodepos1) + nodepos2 - (nodepos1 * (nodepos1+1) / 2), from https://jamesmccaffrey.wordpress.com/2010/05/14/converting-a-triangular-matrix-to-an-array/
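+//e.g. with node_count = 4 the packed upper triangle holds 4*5/2 = 10 entries,
+//and (nodepos1,nodepos2) = (1,3) maps to 4*1 + 3 - 1*2/2 = 6.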
+        
+                    edgelist[ (node_count * nodepos1) + nodepos2 - ( nodepos1 * (nodepos1+1) / 2)] += t->cost; 
+                }else{
+                    edgelist[ (node_count * nodepos2) + nodepos1 - ( nodepos2 * (nodepos2 + 1) / 2)] += t->cost;
+                }
+                
+            }
+
+            for(k = 0; k < t->nr_uses; k++)
+            {
+                r2 = getres(t->uses[k], s);
+                struct res *r3 = r2;
+                while(r3->parent != -1)
+                {
+                    r3 = getres(r3->parent, s);
+                    if(r3->num_users > 0)
+                        r2 = r3;
+                }
+                if(r == r2)
+                {
+                    continue;
+                }
+
+                int index1 = r - s->res;
+                int index2 = r2 - s->res;
+
+                int nodepos1 = -1, nodepos2 = -1;
+
+                for(l = 0; l < node_count; l++)
+                {
+                    if(noderef[l] == index1)
+                        nodepos1 = l;
+    
+                    if(noderef[l] == index2)
+                        nodepos2 = l;
+
+                }
+                if(nodepos1 < 0 || nodepos2 < 0){
+                        error("ERROR NODEPOS NOT VALID\n");
+                }
+                if(nodepos1 < nodepos2)
+                {
+//[nodepos1][nodepos2] maps to (node_count * nodepos1) + nodepos2 - (nodepos1 * (nodepos1+1) / 2), from https://jamesmccaffrey.wordpress.com/2010/05/14/converting-a-triangular-matrix-to-an-array/
+        
+                    edgelist[ (node_count * nodepos1) + nodepos2 - ( nodepos1 * (nodepos1+1) / 2)] += t->cost; 
+                }else{
+                    edgelist[ (node_count * nodepos2) + nodepos1 - ( nodepos2 * (nodepos2 + 1) / 2)] += t->cost;
+                }
+            }
+        }//Locks
+
+        for(j = 0; j < t->nr_uses; j++)
+        {
+            r = getres(t->uses[j], s);
+            struct res *r2 = r;
+            while(r2->parent != -1)
+            {
+                r2 = getres(r2->parent, s);
+                if(r2->num_users > 0)
+                    r = r2;
+            }
+            for(k = 0; k < t->nr_locks; k++)
+            {
+
+                r2 = getres(t->locks[k], s);
+                struct res *r3 = r2;
+                while(r3->parent != -1)
+                {
+                    r3 = getres(r3->parent, s);
+                    if(r3->num_lockers > 0)
+                        r2 = r3;
+                }
+
+                if(r == r2)
+                {
+                    continue;
+                }
+                int index1 = r - s->res;
+                int index2 = r2 - s->res;
+
+                int nodepos1 = -1, nodepos2 = -1;
+
+                for(l = 0; l < node_count; l++)
+                {
+                    if(noderef[l] == index1)
+                        nodepos1 = l;
+    
+                    if(noderef[l] == index2)
+                        nodepos2 = l;
+
+                }
+                if(nodepos1 < 0 || nodepos2 < 0){
+                        error("ERROR NODEPOS NOT VALID\n");
+                }
+                if(nodepos1 < nodepos2)
+                {
+//[nodepos1][nodepos2] maps to (node_count * nodepos1) + nodepos2 - (nodepos1 * (nodepos1+1) / 2), from https://jamesmccaffrey.wordpress.com/2010/05/14/converting-a-triangular-matrix-to-an-array/
+        
+                    edgelist[ (node_count * nodepos1) + nodepos2 - ( nodepos1 * (nodepos1+1) / 2)] += t->cost; 
+                }else{
+                    edgelist[ (node_count * nodepos2) + nodepos1 - ( nodepos2 * (nodepos2 + 1) / 2)] += t->cost;
+                }
+                
+            }
+
+            for(k = 0; k < t->nr_uses; k++)
+            {
+                if(j == k)
+                    continue;
+                r2 = getres(t->uses[k], s);
+                struct res *r3 = r2;
+                while(r3->parent != -1)
+                {
+                    r3 = getres(r3->parent, s);
+                    if(r3->num_users > 0)
+                        r2 = r3;
+                }
+                if(r == r2)
+                {
+                    continue;
+                }
+
+                int index1 = r - s->res;
+                int index2 = r2 - s->res;
+
+                int nodepos1 = -1, nodepos2 = -1;
+
+                for(l = 0; l < node_count; l++)
+                {
+                    if(noderef[l] == index1)
+                        nodepos1 = l;
+    
+                    if(noderef[l] == index2)
+                        nodepos2 = l;
+
+                }
+                if(nodepos1 < 0 || nodepos2 < 0){
+                        printf("ERROR NODEPOS NOT VALID\n");
+                        MPI_Finalize();
+                        exit(0);
+                }
+                if(nodepos1 < nodepos2)
+                {
+//[nodepos1][nodepos2] maps to (node_count * nodepos1) + nodepos2 - (nodepos1 * (nodepos1+1) / 2), from https://jamesmccaffrey.wordpress.com/2010/05/14/converting-a-triangular-matrix-to-an-array/
+        
+                    edgelist[ (node_count * nodepos1) + nodepos2 - ( nodepos1 * (nodepos1+1) / 2)] += t->cost; 
+                }else{
+                    edgelist[ (node_count * nodepos2) + nodepos1 - ( nodepos2 * (nodepos2 + 1) / 2)] += t->cost;
+                }
+            }
+        }//Uses
+    }
+
+    MPI_Allreduce(MPI_IN_PLACE, edgelist, ((node_count * (node_count+1))/2), MPI_INT, MPI_SUM, s->comm);
+
+    printf("edgelist = [ ");
+    for(i = 0; i < ((node_count * (node_count+1))/2); i++)
+    {
+        printf("%i, ", edgelist[i]);
+    }
+    printf("]\n");
+
+    printf("nodelist = [ ");
+    for(i = 0; i < node_count; i++)
+    {
+        printf("%i, ", nodelist[i]);
+    }
+    printf("]\n");
+   
+    //Loop through edgelist and remove empty slots. Count how big it needs to be.
+    int edgelist_size = 0;
+    for(i = 0; i < ((node_count * (node_count+1))/2); i++)
+    {
+        if(edgelist[i] != 0)
+            edgelist_size++;
+    }
+
+    printf("edgelist_size = %i\n", edgelist_size);
+
+    //Make the new contiguous edge list.
+    idx_t *edgelist_new = malloc(sizeof(idx_t) * edgelist_size*2);
+    idx_t *edgelist_pos = malloc(sizeof(idx_t) * (node_count + 1));
+    edgelist_pos[0] = 0;
+    idx_t *edgelist_vwgt = malloc(sizeof(idx_t) * edgelist_size*2);
+    idx_t edgelist_count = 0;
+    for(i = 0; i < node_count; i++)
+    {
+        for(j = 0; j < node_count; j++)
+        {
+            if(i == j)
+                continue;
+            if(i < j && edgelist[ (node_count * i) + j - ( i * (i + 1) / 2)] != 0)
+            {
+                edgelist_new[edgelist_count] = j;
+                edgelist_vwgt[edgelist_count++ ] = edgelist[ (node_count * i) + j - ( i * (i + 1) / 2)];
+            }else if (j < i && edgelist[ (node_count * j) + i - ( j * (j + 1) / 2)] != 0){
+                edgelist_new[edgelist_count] = j;
+                edgelist_vwgt[edgelist_count++] = edgelist[ (node_count * j) + i - ( j * (j + 1) / 2)];
+            }
+            
+        }   
+        edgelist_pos[i+1] = edgelist_count;
+    }
+
+    /* Set the METIS options. */
+        idx_t options[METIS_NOPTIONS];
+        METIS_SetDefaultOptions( options );
+        options[ METIS_OPTION_OBJTYPE ] = METIS_OBJTYPE_CUT;
+        options[ METIS_OPTION_NUMBERING ] = 0;
+        options[ METIS_OPTION_CONTIG ] = 0; //TODO 1
+        options[ METIS_OPTION_NCUTS ] = 10;
+        options[ METIS_OPTION_NITER ] = 20;
+        options[ METIS_OPTION_UFACTOR ] = 10;
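+        /* NCUTS is the number of candidate partitionings METIS computes (the
+           best cut is kept), NITER the number of refinement iterations per
+           level, and UFACTOR=10 allows a load imbalance of up to 1%. */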
+    
+        idx_t one = 1;
+        idx_t objval =0;
+        idx_t *nodeIDs;
+        nodeIDs = (idx_t *)malloc( sizeof(idx_t) * node_count );
+       // printf("%i\n", node_count);
+//        printf("%i\n", s->count_ranks);
+        if( METIS_PartGraphKway(&node_count, &one, edgelist_pos, edgelist_new, nodelist, NULL, edgelist_vwgt, &s->count_ranks, NULL, NULL,options, &objval, nodeIDs) != METIS_OK)
+            error("Failed to partition\n");
+
+ 
+#else
+    error("Quicksched wasn't compiled with MPI support");
+#endif
+
+}
+
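+/**
+ * @brief MPI variant of qsched_prepare(): synchronise the schedulers of all
+ *        ranks, sort the dependency/lock/use lists, re-link the per-task and
+ *        per-resource pointer arrays and run the METIS partitioning.
+ *
+ * Note that the function currently returns right after qsched_partition(), so
+ * the queue initialisation and topological sort below are not reached yet.
+ */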
+void qsched_prepare_mpi( struct qsched *s){
+
+#ifdef WITH_MPI
+    int j, k, count;
+    struct task *t, *tasks;
+    
+    TIMER_TIC
+
+    /* Lock the sched. */
+    lock_lock( &s->lock );
+    
+    /* Get a pointer to the tasks, set the count. */
+    tasks = s->tasks;
+    count = s->count;
+    /* If the sched is dirty... */
+    if ( s->flags & qsched_flag_dirty ) {
+    
+        qsched_sync_schedulers( s );
+        tasks = s->tasks;        
+        count = s->count;
+        /* Do the sorts in parallel, if possible. */
+        #pragma omp parallel
+        {
+    
+            /* Sort the unlocks. */
+            #pragma omp single nowait
+            qsched_quicksort( s->deps , s->deps_key , s->count_deps , 0 , s->res[count-1].ID );
+
+            /* Sort the locks. */
+            #pragma omp single nowait
+            qsched_quicksort( s->locks , s->locks_key , s->count_locks , 0 , s->res[count-1].ID );
+
+            /* Sort the uses. */
+            #pragma omp single nowait
+            qsched_quicksort( s->uses , s->uses_key , s->count_uses , 0 , s->res[count-1].ID );
+
+            #pragma omp single nowait
+            qsched_quicksort( s->users, s->users_key, s->count_users, 0, s->res[count-1].ID);
+
+            #pragma omp single nowait
+            qsched_quicksort( s->lockers, s->lockers_key, s->count_lockers, 0, s->res[count-1].ID);
+            
+        }
+        /* Run through the tasks and link the locks and unlocks. Also link the users and lockers */
+        tasks[0].unlocks = s->deps;
+        tasks[0].locks = s->locks;
+        tasks[0].uses = s->uses;
+        s->res[0].users = s->users;
+        s->res[0].lockers = s->lockers;
+        for ( k = 1 ; k < s->task_ranks[s->count_ranks] ; k++ ) {
+            tasks[k].unlocks = &tasks[k-1].unlocks[ tasks[k-1].nr_unlocks ];
+            tasks[k].locks = &tasks[k-1].locks[ tasks[k-1].nr_locks ];
+            printf("tasks[%i].locks = %p\n",k, s->tasks[k].locks);
+            tasks[k].uses = &tasks[k-1].uses[ tasks[k-1].nr_uses ];
+            }
+        for(k = 1; k < s->res_ranks[s->count_ranks]; k++){
+            s->res[k].users = &s->res[k-1].users[ s->res[k-1].num_users ];
+            s->res[k].lockers = &s->res[k-1].lockers[ s->res[k-1].num_lockers ];
+        }
+        
+        /* All cleaned-up now! */
+        s->flags &= ~qsched_flag_dirty;
+    
+        }
+    qsched_partition(s);
+    return;    
+    /* Init the queues. */
+    for ( k = 0 ; k < s->nr_queues ; k++ )
+        queue_init( &s->queues[k] , count );
+    
+    /* Run through the tasks and set the waits... */
+    for ( k = 0 ; k < count ; k++ ) {
+        t = &tasks[k];
+        if ( !( t->flags & task_flag_skip ) )
+            for ( j = 0 ; j < t->nr_unlocks ; j++ )
+                tasks[ t->unlocks[j] ].wait += 1;
+        }
+        
+    /* Sort the tasks topologically. */
+    long long int *tid = (long long int *)malloc( sizeof(long long int) * count );
+    for ( j = 0 , k = 0 ; k < count ; k++ )
+        if ( tasks[k].wait == 0 ) {
+            tid[j] = k;
+            j += 1;
+            }
+    int ready = j;
+    for ( k = 0 ; k < j ; k++ ) {
+        t = &tasks[ tid[k] ];
+        for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ )
+            if ( ( tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) {
+                tid[j] = t->unlocks[kk];
+                j += 1;
+                }
+        }
+    if ( k < count )
+        error( "Circular dependencies detected." );
+        
+    /* Run through the topologically sorted tasks backwards and
+       set their weights, re-setting the waits while we're at it. */
+    for ( k = count-1 ; k >= 0 ; k-- ) {
+        long long int maxweight = 0;
+        t = &tasks[ tid[k] ];
+        for ( j = 0 ; j < t->nr_unlocks ; j++ ) {
+            tasks[ t->unlocks[j] ].wait += 1;
+            if ( tasks[ t->unlocks[j] ].weight > maxweight )
+                maxweight = tasks[ t->unlocks[j] ].weight;
+            }
+        t->weight = t->cost + maxweight;
+        }
+
+    /* Run through the tasks and enqueue the non-waiting ones. */
+    for ( k = 0 ; k < ready ; k++ ) {
+        t = &tasks[tid[k]];
+        if ( t->wait == 0 && !( t->flags & task_flag_skip ) )
+            qsched_enqueue( s , t );
+        }
+        
+    /* Clean up. */
+    free( tid );
+        
+    /* Set the number of waiting tasks. */
+    s->waiting = count;
+        
+    /* Set the ready flag. */
+    s->flags |= qsched_flag_ready;
+
+    /* Unlock the sched. */
+    lock_unlock_blind( &s->lock );
+    
+    TIMER_TOC( s , qsched_timer_prepare );
+
+#else
+    error("Quicksched wasn't compiled with MPI support");
+#endif
+
+}
+
+
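+/**
+ * @brief Run the tasks over MPI. For now this only prepares the MPI
+ *        scheduler; spawning the nr_threads workers and executing tasks
+ *        through fun is not implemented yet.
+ */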
+void qsched_run_MPI ( struct qsched *s, int nr_threads, qsched_funtype fun ) {
+
+#ifdef WITH_MPI
+
+    /* Prepare the scheduler. */
+        qsched_prepare_mpi( s );
+#else
+
+    error("Quicksched wasn't compiled with MPI support");
+#endif
+
+}
 
 /**
  * @brief Execute all the tasks in the current scheduler using
@@ -651,7 +1408,7 @@ void qsched_done ( struct qsched *s , struct task *t ) {
 
     /* Release this task's locks. */
     for ( k = 0 ; k < t->nr_locks ; k++ )
-        qsched_unlockres_local( s , t->locks[k] );
+        qsched_unlockres( s , t->locks[k] );
     
     /* Loop over the task's unlocks... */
     for ( k = 0 ; k < t->nr_unlocks ; k++ ) {
@@ -693,40 +1450,40 @@ void qsched_done ( struct qsched *s , struct task *t ) {
  * @return @c 1 if the resource could be locked, @c 0 otherwise.
  */
  
-int qsched_lockres_local ( struct qsched *s , long long int rid ) {
+int qsched_lockres ( struct qsched *s , long long int rid ) {
 #ifdef WITH_MPI
     error("Don't use local functions with MPI setup.");
 #else
     int finger, finger2;
     
     /* Try to lock the root-level resource. */
-    if ( s->res[rid].hold || lock_trylock( &s->res[rid].lock ) )
+    if ( s->res[getindex(rid,s)].hold || lock_trylock( &s->res[getindex(rid,s)].lock ) )
         return 0;
         
     /* Did the resource get held in the meantime? */
-    if ( s->res[rid].hold ) {
-        lock_unlock_blind( &s->res[rid].lock );
+    if ( s->res[getindex(rid,s)].hold ) {
+        lock_unlock_blind( &s->res[getindex(rid,s)].lock );
         return 0;
         }
         
     /* Follow parents and increase their hold counter, but fail
        if any are locked. */
-    for ( finger = s->res[rid].parent ; finger != qsched_res_none ; finger = s->res[finger].parent ) {
-        if ( lock_trylock( &s->res[finger].lock ) )
+    for ( finger = s->res[getindex(rid,s)].parent ; finger != qsched_res_none ; finger = s->res[getindex(finger,s)].parent ) {
+        if ( lock_trylock( &s->res[getindex(finger,s)].lock ) )
             break;
-        atomic_inc( &s->res[finger].hold );
-        lock_unlock_blind( &s->res[finger].lock );
+        atomic_inc( &s->res[getindex(finger,s)].hold );
+        lock_unlock_blind( &s->res[getindex(finger,s)].lock );
         }
         
     /* Did we fail on the way up? */
     if ( finger != qsched_res_none ) {
     
         /* Unlock the resource. */
-        lock_unlock_blind( &s->res[rid].lock );
+        lock_unlock_blind( &s->res[getindex(rid,s)].lock );
     
         /* Go back up the tree and undo the holds. */
-        for ( finger2 = s->res[rid].parent ; finger2 != finger ; finger2 = s->res[finger2].parent )
-            atomic_dec( &s->res[finger2].hold );
+        for ( finger2 = s->res[getindex(rid,s)].parent ; finger2 != finger ; finger2 = s->res[getindex(finger2,s)].parent )
+            atomic_dec( &s->res[getindex(finger2,s)].hold );
             
         /* Fail. */
         return 0;
@@ -747,20 +1504,17 @@ int qsched_lockres_local ( struct qsched *s , long long int rid ) {
  * @param rid The ID of the resource to lock.
  */
  
-void qsched_unlockres_local ( struct qsched *s , long long int rid ) {
-#ifdef WITH_MPI
-    error("Don't use local functions with MPI setup.");
-#else
+void qsched_unlockres ( struct qsched *s , long long int rid ) {
+
     int finger;
         
     /* Unlock the resource. */
-    lock_unlock_blind( &s->res[rid].lock );
+    lock_unlock_blind( &s->res[getindex(rid,s)].lock );
 
     /* Go back up the tree and undo the holds. */
-    for ( finger = s->res[rid].parent ; finger != qsched_res_none ; finger = s->res[finger].parent )
-        atomic_dec( &s->res[finger].hold );
+    for ( finger = s->res[getindex(rid,s)].parent ; finger != qsched_res_none ; finger = s->res[getindex(finger,s)].parent )
+        atomic_dec( &s->res[getindex(finger,s)].hold );
             
-#endif
     }
     
     
@@ -773,21 +1527,18 @@ void qsched_unlockres_local ( struct qsched *s , long long int rid ) {
  * @return @c 1 if the resources could be locked, @c 0 otherwise.
  */
  
-int qsched_locktask_local ( struct qsched *s ,long long int tid ) {
-#ifdef WITH_MPI
-    error("Don't use local functions with MPI setup.");
-#else
+int qsched_locktask ( struct qsched *s ,long long int tid ) {
     int k;
     struct task *t;
     
     TIMER_TIC
 
     /* Get a pointer on the task. */
-    t = &s->tasks[tid];
+    t = &s->tasks[gettaskindex(tid,s)];
         
     /* Try to lock all the task's locks. */
     for ( k = 0 ; k < t->nr_locks ; k++ )
-        if ( qsched_lockres_local( s , t->locks[k] ) == 0 )
+        if ( qsched_lockres( s , t->locks[k] ) == 0 )
             break;
 
     /* If I didn't get all the locks... */
@@ -795,7 +1546,7 @@ int qsched_locktask_local ( struct qsched *s ,long long int tid ) {
 
         /* Unroll the locks I got. */
         for ( k -= 1 ; k >= 0 ; k-- )
-            qsched_unlockres_local( s , t->locks[k] );
+            qsched_unlockres( s , t->locks[k] );
 
         /* Fail. */
         TIMER_TOC( s , qsched_timer_lock )
@@ -808,7 +1559,7 @@ int qsched_locktask_local ( struct qsched *s ,long long int tid ) {
         TIMER_TOC( s , qsched_timer_lock )
         return 1;
         }
-#endif            
+          
     }
     
 
@@ -819,24 +1570,20 @@ int qsched_locktask_local ( struct qsched *s ,long long int tid ) {
  * @param tid The ID of the #task to unlock.
  */
  
-void qsched_unlocktask_local ( struct qsched *s , long long int tid ) {
-#ifdef WITH_MPI
-    error("Don't use local functions with MPI setup.");
-#else
+void qsched_unlocktask ( struct qsched *s , long long int tid ) {
     int k;
     struct task *t;
     
     TIMER_TIC
 
     /* Get a pointer on the task. */
-    t = &s->tasks[tid];
+    t = &s->tasks[gettaskindex(tid,s)];
         
     /* Unlock the used resources. */
     for ( k = 0 ; k < t->nr_locks ; k++ )
-        qsched_unlockres_local( s , t->locks[k] );
+        qsched_unlockres( s , t->locks[k] );
         
     TIMER_TOC( s , qsched_timer_lock )
-#endif
     }
 
 
@@ -1292,6 +2039,11 @@ long long int qsched_addres ( struct qsched *s, int owner, int size, void **data
 struct res *res_new;
     long long int id;
     
+    if( (s->flags & qsched_flag_ressync) != 0)
+    {
+        error("Resources have already been synchronised.");
+    }
+
     lock_lock( &s->lock);
 
     //Reallocate res array if neccessary.
@@ -1371,6 +2123,11 @@ long long int qsched_addchildres( struct qsched *s, long long int parent, int ow
     struct res *res_new;
     long long int id;
     
+    if( (s->flags & qsched_flag_ressync) != 0)
+    {
+        error("Resources have already been synchronised.");
+    }
+
     lock_lock( &s->lock);
     //Reallocate res array if neccessary.
     /* Do the deps need to be re-allocated? */
@@ -1598,6 +2355,7 @@ void qsched_addlock ( struct qsched *s , long long int t , long long int res ) {
         
         s->lockers = (long long int *) temp1;
         s->lockers_key = (long long int *) temp2;
     }
+    s->res[getindex(res,s)].num_lockers += 1;
     s->lockers[ s->count_lockers ] = t;
     s->lockers_key[ s->count_lockers ] = res;
@@ -1661,6 +2419,31 @@ void qsched_adduse ( struct qsched *s , long long int t , long long int res ) {
     s->uses_key[ s->count_uses ] = t;
     s->tasks[gettaskindex(t, s)].nr_uses += 1;
     
+    #ifdef WITH_MPI
+    /* Do the users need to be reallocated? */
+    if(s->count_users == s->size_users) {
+        s->size_users *= qsched_stretch;
+
+        if ( ( temp1 = malloc( sizeof(long long int) * s->size_users ) ) == NULL ||
+             ( temp2 = malloc( sizeof(long long int) * s->size_users ) ) == NULL )
+            error("Failed to allocate new lockers lists." );
+
+        memcpy( temp1, s->users, sizeof(long long int) * s->count_users );
+        memcpy( temp2, s->users_key, sizeof(long long int) * s->count_users );
+
+        /* Free the old users lists. */
+        free( s->users );
+        free( s->users_key );
+        
+        s->users = (long long int *) temp1;
+        s->users_key = (long long int *) temp2;
+    }
+    s->users[ s->count_users ] = t;
+    s->users_key[ s->count_users ] = res;
+    s->count_users += 1;
+    s->res[getindex(res,s)].num_users += 1;
+    #endif
+
     /* Increase the uses counter. */
     s->count_uses += 1;
     
@@ -1894,6 +2677,65 @@ void qsched_free ( struct qsched *s ) {
     }
 
 
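+/**
+ * @brief Gather the resources of all MPI ranks into a single global array.
+ *
+ * Uses the same allreduce-as-allgather scheme as qsched_sync_schedulers(),
+ * sets the data pointer of every off-node resource to NULL, and marks the
+ * scheduler with qsched_flag_ressync so that no further resources can be
+ * added afterwards.
+ */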
+void qsched_sync_resources(struct qsched *s)
+{
+    #ifndef WITH_MPI
+    error("Quicksched wasn't compiled with MPI support.");
+    #else
+    int ierr, i;//,j,k;
+
+    lock_lock(&s->lock);
+
+    s->res_ranks[s->rank+1] = s->count_res;
+    ierr = MPI_Allreduce(MPI_IN_PLACE, s->res_ranks, s->count_ranks+1, MPI_INT, MPI_SUM, s->comm);
+    if(ierr != MPI_SUCCESS)
+    {
+        error("Failed to allreduce s->res_ranks");
+    }
+    s->res_ranks[0] = 0;
+
+    /* s->res_ranks[num_ranks] should be the total number of resources. */
+    for(i = 1; i < s->count_ranks+1; i++){
+        s->res_ranks[i] += s->res_ranks[i-1];
+    }
+
+    /* Create the array to hold all of the resources. */
+    struct res *res_new = (struct res*) calloc(s->res_ranks[s->count_ranks] , sizeof(struct res));
+    /* res_local contains the pointer to the resources created on this processor.*/
+    struct res *res_local = &res_new[s->res_ranks[s->rank]];
+    for(i = 0; i < s->count_res; i++)
+    {
+        res_local[i] = s->res[i];
+    }
+    /* Calculate the size of the res_new array with respect to sizeof(int)*/
+    int number = sizeof(struct res) * s->res_ranks[s->count_ranks];
+    number = number / sizeof(int);
+    MPI_Allreduce(MPI_IN_PLACE, res_new, number, MPI_INT, MPI_SUM, s->comm);
+
+    free(s->res);
+    s->res = res_new;
+
+    //Loop through all the resources and if they aren't on this node, set data to NULL.
+    for(i = 0; i < s->res_ranks[s->count_ranks]; i++)
+    {
+        if(s->res[i].ID>>48 != s->rank)
+            s->res[i].data = NULL;
+    }
+
+    /* Mark the scheduler as "MPI clean". */
+    s->flags &= ~qsched_flag_mpidirty;
+    s->flags |= qsched_flag_ressync;
+    
+
+    lock_unlock_blind( &s->lock);
+
+
+    #endif
+}
+
+
+
+
 /**
  * @brief Initialize the given #qsched object.
  *
@@ -1904,11 +2746,25 @@ void qsched_free ( struct qsched *s ) {
  * Initializes the given #qsched with the given number of queues.
  */
  
-void qsched_init ( struct qsched *s , int nr_queues , int flags ) {
+#ifdef WITH_MPI
+void qsched_init ( struct qsched *s , int nr_queues , int flags , MPI_Comm comms){
+#else
+void qsched_init ( struct qsched *s , int nr_queues , int flags ){
+#endif
     
     /* Set the flags to begin with. */
     s->flags = flags;
     
+    #ifdef WITH_MPI
+    s->comm = comms;
+    MPI_Comm_rank(comms, &s->rank);
+    MPI_Comm_size(comms, &s->count_ranks);
+
+    s->res_ranks = (int*) calloc(s->count_ranks+1, sizeof(int));
+    s->task_ranks = (int*) calloc(s->count_ranks+1, sizeof(int));
+    #endif
+
     /* Allocate and clear the queues (will init when sched is
        finalized. */
     if ( ( s->queues = (struct queue *)malloc( sizeof(struct queue) * nr_queues ) ) == NULL )
@@ -1958,8 +2814,9 @@ void qsched_init ( struct qsched *s , int nr_queues , int flags ) {
 
     s->size_lockers = qsched_init_lockspertask * s->count_res;
     if ( ( s->lockers = (long long int *)malloc( sizeof(long long int) * s->size_lockers ) ) == NULL ||
-         ( s->lockers = (long long int *)malloc( sizeof(long long int) * s->size_lockers ) ) == NULL )
+         ( s->lockers_key = (long long int *)malloc( sizeof(long long int) * s->size_lockers ) ) == NULL )
         error( "Failed to allocate memory for lockers." );
+    s->count_lockers = 0;
 #endif
 
     /* Allocate the initial data. */
diff --git a/src/qsched.h b/src/qsched.h
index 6797c7bce6e8ee0b497e3712cb1aebbc697b3b81..9730f1194535fd044452401bb7d5457d2edf297b 100644
--- a/src/qsched.h
+++ b/src/qsched.h
@@ -26,6 +26,7 @@
 #define qsched_flag_noreown              16
 #define qsched_flag_norecost             32
 #define qsched_flag_mpidirty             64
+#define qsched_flag_ressync              128
 
 /* Some sched-specific constants. */
 #define qsched_stretch                   2
@@ -183,7 +184,7 @@ struct qsched {
     long long int *users_key;
     int size_users;
     int count_users;
-    /* MPICOMM TODO*/
+    MPI_Comm comm;
     #endif
     
     };
@@ -212,20 +213,28 @@ void qsched_sort_rec ( long long int *data , long long int *ind , int N , long l
 struct task *qsched_gettask ( struct qsched *s , int qid );
 void qsched_done ( struct qsched *s , struct task *t );
 void *qsched_getdata( struct qsched *s , struct task *t );
-int qsched_lockres_local ( struct qsched *s , long long int rid );
-void qsched_unlockres_local ( struct qsched *s , long long int rid );
-int qsched_locktask_local ( struct qsched *s , long long int tid );
-void qsched_unlocktask_local ( struct qsched *s , long long int tid );
+int qsched_lockres ( struct qsched *s , long long int rid );
+void qsched_unlockres ( struct qsched *s , long long int rid );
+int qsched_locktask ( struct qsched *s , long long int tid );
+void qsched_unlocktask ( struct qsched *s , long long int tid );
 void qsched_prepare ( struct qsched *s );
 void qsched_enqueue ( struct qsched *s , struct task *t );
 
 /* External functions for MPI. */
-qsched_res_t qsched_address (struct qsched *s, int owner, int size, void **data );
+qsched_res_t qsched_addres (struct qsched *s, int owner, int size, void **data );
 qsched_res_t qsched_addchildres( struct qsched *s, long long int parent, int owner, int size, int position, void **data);
 qsched_task_t qsched_addtask ( struct qsched *s , int type , unsigned int flags , void *data , int data_size , int cost );
+void qsched_sync_resources(struct qsched *s);
+void qsched_sync_schedulers( struct qsched *s);
+void qsched_prepare_mpi( struct qsched *s);
+
 
 /* External functions. */
+#ifdef WITH_MPI
+void qsched_init ( struct qsched *s , int nr_queues , int flags , MPI_Comm comms);
+#else
 void qsched_init ( struct qsched *s , int nr_queues , int flags );
+#endif
 qsched_res_t qsched_addres_local ( struct qsched *s , int owner , qsched_res_t parent );
 void qsched_addlock ( struct qsched *s , qsched_task_t t , qsched_res_t res );
 void qsched_addunlock ( struct qsched *s , qsched_task_t ta , qsched_task_t tb );
diff --git a/src/queue.c b/src/queue.c
index ed9fbd38a1d6ad5ad572e8b469d1c861e6e6dace..f3d94bff5a4bcddd4215db2782a3af6838e3f8f1 100644
--- a/src/queue.c
+++ b/src/queue.c
@@ -80,7 +80,7 @@ int queue_get ( struct queue *q , struct qsched *s , int insist ) {
         tid = inds[k];
         
         /* If the task can be locked, break. */
-        if ( qsched_locktask_local( s , tid ) )
+        if ( qsched_locktask( s , tid ) )
             break;
     
         }
diff --git a/src/res.h b/src/res.h
index cdea3f49be7980bf3a38f1e05ff8e6db35d57928..72fad5a9d87a70a4096bec1ffe0f7ccc0718b031 100644
--- a/src/res.h
+++ b/src/res.h
@@ -49,10 +49,10 @@ struct res {
     int offset;
 
     /* The pointer to the tasks that use this resource.*/
-    int *users;
+    long long int *users;
 
     /* The pointer to the tasks that lock this resource.*/
-    int *lockers;
+    long long int *lockers;
 
     /* The number of tasks that use/lock this resource. */    
     int num_users, num_lockers; 
diff --git a/tests/Makefile.am b/tests/Makefile.am
new file mode 100644
index 0000000000000000000000000000000000000000..c6b5a7d26fb72257ce5da38ee5054f426f375dbc
--- /dev/null
+++ b/tests/Makefile.am
@@ -0,0 +1,42 @@
+# This file is part of QuickSched.
+# Copyright (c) 2015 Aidan Chalk (aidan.chalk@durham.ac.uk),
+# 
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+AM_CFLAGS = -g -O3 -Wall -Werror -I../src -ffast-math -fstrict-aliasing \
+    -ftree-vectorize -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
+    -DCPU_TPS=2.67e9 -DTIMERS -std=gnu99 \
+     -fsanitize=address -fno-omit-frame-pointer
+
+AM_LDFLAGS = -lm -fsanitize=address
+
+METIS_LIBS = @METIS_LIBS@
+MPI_THREAD_LIBS = @MPI_THREAD_LIBS@
+MPI_LIBS = $(METIS_LIBS) $(MPI_THREAD_LIBS)
+
+
+bin_PROGRAMS = test_mpi #test_mpi_synchronize
+
+# Sources for test
+test_mpi_LDADD =  ../src/.libs/libquickschedMPI.a $(METIS_LIBS)
+test_mpi_LDFLAGS = $(MPI_THREAD_LIBS) 
+test_mpi_SOURCES = test_mpi.c
+test_mpi_CFLAGS = $(AM_CFLAGS) -DWITH_MPI
+
+
+#test_mpi_synchronize_LDFLAGS = $(MPI_THREAD_LIBS)
+#test_mpi_synchronize_SOURCES = test_mpi_synchronize.c
+#test_mpi_synchronize_CFLAGS = $(AM_CFLAGS) -DWITH_MPI
+#test_mpi_synchronize_LDADD =  ../src/.libs/libquickschedMPI.a
diff --git a/tests/test_mpi.c b/tests/test_mpi.c
new file mode 100644
index 0000000000000000000000000000000000000000..a6739fa84d8b1152203ec797245b925ae8590d6a
--- /dev/null
+++ b/tests/test_mpi.c
@@ -0,0 +1,86 @@
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <pthread.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <omp.h>
+#include <sched.h>
+#include <mpi.h>
+#include <metis.h>
+
+/* Local includes. */
+#include "quicksched.h"
+#include "res.h"
+
+
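+/* Minimal MPI smoke test: each rank creates one resource and one task that
+   locks and uses it, synchronises the resources and schedulers, runs the
+   METIS partitioning via qsched_prepare_mpi() and dumps the gathered task,
+   locker and user lists. Run with e.g. "mpirun -np 2 ./test_mpi". */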
+int main(int argc, char *argv[]) {
+    // Initialize the MPI environment
+    MPI_Init(NULL, NULL);
+
+    struct qsched s;
+    
+    qsched_init(&s, 0, 0, MPI_COMM_WORLD);
+
+    // Get the name of the processor
+    char processor_name[MPI_MAX_PROCESSOR_NAME];
+    int name_len;
+    MPI_Get_processor_name(processor_name, &name_len);
+
+    int *data;
+    long long int resid;
+    resid = qsched_addres( &s, qsched_owner_none, 8*sizeof(int), (void**)&data);
+    
+    qsched_sync_resources(&s);
+
+    long long int taskid;
+    taskid = qsched_addtask(&s, 2, 0, NULL, 0, 25);
+    qsched_addlock(&s, taskid, resid);
+    qsched_adduse(&s, taskid, resid);
+
+
+//    qsched_sync_schedulers(&s);
+    qsched_prepare_mpi(&s);
+    int i,j;
+    for(i = 0; i < s.task_ranks[s.count_ranks]; i++)
+    {
+        printf("s->tasks[i].id = %lli\n", s.tasks[i].id);
+    }
+
+    for(i = 0; i < s.count_lockers; i++)
+    {
+        printf("s->tasks[i].id = %lli\n", s.lockers[i]);
+    }
+
+    for(i = 0; i < s.res_ranks[s.count_ranks]; i++)
+    {
+        printf("resource[i] = %lli, %i %i\n", s.res[i].ID, s.res[i].num_lockers, s.res[i].num_users);
+        printf("users are");
+        for(j = 0; j < s.res[i].num_users; j++)
+        {
+            printf(", %lli", s.res[i].users[j]);
+        }
+        printf("\n");
+
+        printf("lockers are");
+        for(j = 0; j < s.res[i].num_lockers; j++)
+        {
+            printf(", %lli", s.res[i].lockers[j]);
+        }
+        printf("\n");
+    }
+
+    // Print off a hello world message
+    printf("Hello world from processor %s, rank = %i, count_ranks = %i\n",
+           processor_name, s.rank, s.count_ranks);
+
+    // Finalize the MPI environment.
+    MPI_Finalize();
+}
diff --git a/tests/test_mpi_synchronize.c b/tests/test_mpi_synchronize.c
new file mode 100644
index 0000000000000000000000000000000000000000..5e6af753682c22a1c59cb0a8adb2c551d325c3d3
--- /dev/null
+++ b/tests/test_mpi_synchronize.c
@@ -0,0 +1,48 @@
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Some standard headers. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <pthread.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <omp.h>
+#include <sched.h>
+#include <mpi.h>
+
+/* Local includes. */
+#include "quicksched.h"
+
+
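+/* Checks that resources cannot be added once qsched_sync_resources() has been
+   called: the second qsched_addres() below is expected to abort with an
+   error. */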
+int main(int argc, char *argv[]) {
+    // Initialize the MPI environment
+    MPI_Init(NULL, NULL);
+
+    struct qsched s;
+    
+    qsched_init(&s, 0, 0, MPI_COMM_WORLD);
+
+    // Get the name of the processor
+    char processor_name[MPI_MAX_PROCESSOR_NAME];
+    int name_len;
+    MPI_Get_processor_name(processor_name, &name_len);
+
+    int *data;
+    qsched_addres( &s, qsched_owner_none, 8*sizeof(int), (void**)&data);
+    
+    qsched_sync_resources(&s);
+
+    qsched_addres( &s, qsched_owner_none, 8*sizeof(int), (void**)&data);
+
+    // Print off a hello world message
+    printf("Hello world from processor %s, rank = %i, count_ranks = %i\n",
+           processor_name, s.rank, s.count_ranks);
+
+    // Finalize the MPI environment.
+    MPI_Finalize();
+}