diff --git a/autom4te.cache/requests b/autom4te.cache/requests
index 6cd5e4a3a44e4986e168bef0c16985538948cd38..5f6702525b223f35d76288fa655f62b87988382b 100644
--- a/autom4te.cache/requests
+++ b/autom4te.cache/requests
@@ -68,11 +68,11 @@
                         '_LT_AC_LANG_C_CONFIG' => 1,
                         'AM_PROG_INSTALL_STRIP' => 1,
                         '_m4_warn' => 1,
-                        'AC_LIBTOOL_OBJDIR' => 1,
                         'AX_GCC_X86_CPUID' => 1,
+                        'AC_LIBTOOL_OBJDIR' => 1,
                         'gl_FUNC_ARGZ' => 1,
-                        'AM_SANITY_CHECK' => 1,
                         'LTOBSOLETE_VERSION' => 1,
+                        'AM_SANITY_CHECK' => 1,
                         'AC_LIBTOOL_LANG_GCJ_CONFIG' => 1,
                         'AC_LIBTOOL_PROG_COMPILER_PIC' => 1,
                         'LT_LIB_M' => 1,
@@ -85,21 +85,21 @@
                         'AC_LIBTOOL_GCJ' => 1,
                         'DX_CLEAR_DEPEND' => 1,
                         '_LT_WITH_SYSROOT' => 1,
-                        'LT_FUNC_DLSYM_USCORE' => 1,
                         'LT_SYS_DLOPEN_DEPLIBS' => 1,
-                        '_LT_AC_LANG_F77' => 1,
+                        'LT_FUNC_DLSYM_USCORE' => 1,
                         'AC_LIBTOOL_CONFIG' => 1,
-                        '_AM_SUBST_NOTMAKE' => 1,
+                        '_LT_AC_LANG_F77' => 1,
                         'AC_LTDL_DLLIB' => 1,
+                        '_AM_SUBST_NOTMAKE' => 1,
                         '_AM_AUTOCONF_VERSION' => 1,
                         'AM_DISABLE_SHARED' => 1,
                         '_LT_PROG_ECHO_BACKSLASH' => 1,
                         '_LTDL_SETUP' => 1,
-                        '_LT_AC_LANG_CXX' => 1,
                         'AM_PROG_LIBTOOL' => 1,
-                        'AC_LIB_LTDL' => 1,
-                        '_LT_AC_FILE_LTDLL_C' => 1,
+                        '_LT_AC_LANG_CXX' => 1,
                         'AM_PROG_LD' => 1,
+                        '_LT_AC_FILE_LTDLL_C' => 1,
+                        'AC_LIB_LTDL' => 1,
                         'DX_DOXYGEN_FEATURE' => 1,
                         'AU_DEFUN' => 1,
                         'AC_PROG_NM' => 1,
@@ -113,58 +113,58 @@
                         '_AM_SET_OPTION' => 1,
                         'AC_LTDL_PREOPEN' => 1,
                         '_LT_LINKER_BOILERPLATE' => 1,
-                        'AC_LIBTOOL_LANG_CXX_CONFIG' => 1,
-                        'AC_LIBTOOL_PROG_CC_C_O' => 1,
                         '_LT_PREPARE_SED_QUOTE_VARS' => 1,
+                        'AC_LIBTOOL_PROG_CC_C_O' => 1,
+                        'AC_LIBTOOL_LANG_CXX_CONFIG' => 1,
                         'gl_PREREQ_ARGZ' => 1,
                         'AX_EXT' => 1,
-                        'AM_OUTPUT_DEPENDENCY_COMMANDS' => 1,
                         'LT_SUPPORTED_TAG' => 1,
-                        'LT_SYS_MODULE_EXT' => 1,
+                        'AM_OUTPUT_DEPENDENCY_COMMANDS' => 1,
                         'LT_PROG_RC' => 1,
+                        'LT_SYS_MODULE_EXT' => 1,
                         'AC_DEFUN_ONCE' => 1,
                         'AX_CHECK_COMPILE_FLAG' => 1,
                         'DX_XML_FEATURE' => 1,
                         '_LT_AC_LANG_GCJ' => 1,
                         'AC_LTDL_OBJDIR' => 1,
-                        '_LT_PATH_TOOL_PREFIX' => 1,
                         'DX_TEST_FEATURE' => 1,
+                        '_LT_PATH_TOOL_PREFIX' => 1,
                         'AC_LIBTOOL_RC' => 1,
-                        'AM_SILENT_RULES' => 1,
-                        'AC_DISABLE_FAST_INSTALL' => 1,
                         '_LT_AC_PROG_ECHO_BACKSLASH' => 1,
+                        'AC_DISABLE_FAST_INSTALL' => 1,
+                        'AM_SILENT_RULES' => 1,
                         'DX_CHECK_DEPEND' => 1,
                         'DX_FEATURE_pdf' => 1,
-                        '_LT_AC_SYS_LIBPATH_AIX' => 1,
-                        '_LT_AC_TRY_DLOPEN_SELF' => 1,
-                        'DX_REQUIRE_PROG' => 1,
                         'include' => 1,
+                        'DX_REQUIRE_PROG' => 1,
+                        '_LT_AC_TRY_DLOPEN_SELF' => 1,
+                        '_LT_AC_SYS_LIBPATH_AIX' => 1,
                         'LT_AC_PROG_SED' => 1,
                         'AM_ENABLE_SHARED' => 1,
                         'DX_FEATURE_html' => 1,
                         'LTDL_INSTALLABLE' => 1,
-                        '_LT_AC_LANG_GCJ_CONFIG' => 1,
                         'DX_CURRENT_DESCRIPTION' => 1,
+                        '_LT_AC_LANG_GCJ_CONFIG' => 1,
                         'AC_ENABLE_SHARED' => 1,
-                        'AC_ENABLE_STATIC' => 1,
-                        'AC_LIBTOOL_SYS_HARD_LINK_LOCKS' => 1,
                         '_LT_REQUIRED_DARWIN_CHECKS' => 1,
-                        '_LT_AC_TAGVAR' => 1,
-                        'AM_PROG_CC_C_O' => 1,
+                        'AC_LIBTOOL_SYS_HARD_LINK_LOCKS' => 1,
+                        'AC_ENABLE_STATIC' => 1,
                         'AX_FUNC_POSIX_MEMALIGN' => 1,
+                        'AM_PROG_CC_C_O' => 1,
+                        '_LT_AC_TAGVAR' => 1,
                         'AC_LIBTOOL_LANG_F77_CONFIG' => 1,
                         'AM_CONDITIONAL' => 1,
                         'LT_LIB_DLLOAD' => 1,
                         'DX_FEATURE_dot' => 1,
-                        'LTDL_INIT' => 1,
-                        '_LT_PROG_F77' => 1,
-                        '_LT_PROG_CXX' => 1,
                         'LTVERSION_VERSION' => 1,
-                        'AM_PROG_INSTALL_SH' => 1,
+                        '_LT_PROG_CXX' => 1,
+                        '_LT_PROG_F77' => 1,
+                        'LTDL_INIT' => 1,
                         'm4_include' => 1,
+                        'AM_PROG_INSTALL_SH' => 1,
                         'AC_PROG_EGREP' => 1,
-                        '_AC_AM_CONFIG_HEADER_HOOK' => 1,
                         'AC_PATH_MAGIC' => 1,
+                        '_AC_AM_CONFIG_HEADER_HOOK' => 1,
                         'AC_LTDL_SYSSEARCHPATH' => 1,
                         'AM_MAKE_INCLUDE' => 1,
                         'LT_CMD_MAX_LEN' => 1,
@@ -180,51 +180,51 @@
                         'AC_PROG_LD_RELOAD_FLAG' => 1,
                         'DX_FEATURE_chi' => 1,
                         'AC_LTDL_DLSYM_USCORE' => 1,
-                        'LT_LANG' => 1,
                         'AM_MISSING_HAS_RUN' => 1,
+                        'LT_LANG' => 1,
                         'LT_SYS_DLSEARCH_PATH' => 1,
                         'LT_CONFIG_LTDL_DIR' => 1,
-                        'LT_OUTPUT' => 1,
                         'AC_LIBTOOL_DLOPEN_SELF' => 1,
+                        'LT_OUTPUT' => 1,
                         'AC_LIBTOOL_PROG_LD_SHLIBS' => 1,
-                        'AC_LIBTOOL_LINKER_OPTION' => 1,
                         'AC_WITH_LTDL' => 1,
+                        'AC_LIBTOOL_LINKER_OPTION' => 1,
                         'DX_CHI_FEATURE' => 1,
-                        'AC_LIBTOOL_CXX' => 1,
                         'LT_AC_PROG_RC' => 1,
+                        'AC_LIBTOOL_CXX' => 1,
                         'LT_INIT' => 1,
-                        'LT_SYS_DLOPEN_SELF' => 1,
-                        'LT_AC_PROG_GCJ' => 1,
                         'AX_CHECK_COMPILER_FLAGS' => 1,
+                        'LT_AC_PROG_GCJ' => 1,
+                        'LT_SYS_DLOPEN_SELF' => 1,
                         'DX_CURRENT_FEATURE' => 1,
-                        '_LT_AC_PROG_CXXCPP' => 1,
-                        'AM_DISABLE_STATIC' => 1,
                         'AM_DEP_TRACK' => 1,
-                        '_AC_PROG_LIBTOOL' => 1,
+                        'AM_DISABLE_STATIC' => 1,
+                        '_LT_AC_PROG_CXXCPP' => 1,
                         'AM_CONFIG_HEADER' => 1,
-                        '_AM_IF_OPTION' => 1,
+                        '_AC_PROG_LIBTOOL' => 1,
                         'DX_HTML_FEATURE' => 1,
+                        '_AM_IF_OPTION' => 1,
                         'AC_PATH_TOOL_PREFIX' => 1,
-                        'AC_LIBTOOL_F77' => 1,
                         'm4_pattern_allow' => 1,
+                        'AC_LIBTOOL_F77' => 1,
                         'AM_SET_LEADING_DOT' => 1,
-                        'LT_AC_PROG_EGREP' => 1,
                         '_LT_PROG_FC' => 1,
-                        '_AM_DEPENDENCIES' => 1,
+                        'LT_AC_PROG_EGREP' => 1,
                         'DX_DIRNAME_EXPR' => 1,
+                        '_AM_DEPENDENCIES' => 1,
                         'AC_LIBTOOL_LANG_C_CONFIG' => 1,
                         'LTOPTIONS_VERSION' => 1,
                         '_LT_AC_SYS_COMPILER' => 1,
-                        'AM_PROG_NM' => 1,
                         'DX_FEATURE_man' => 1,
+                        'AM_PROG_NM' => 1,
                         'DX_PS_FEATURE' => 1,
                         'AC_LIBLTDL_CONVENIENCE' => 1,
                         'AC_DEPLIBS_CHECK_METHOD' => 1,
-                        'AM_SET_CURRENT_AUTOMAKE_VERSION' => 1,
                         'AC_LIBLTDL_INSTALLABLE' => 1,
+                        'AM_SET_CURRENT_AUTOMAKE_VERSION' => 1,
                         'AC_LTDL_ENABLE_INSTALL' => 1,
-                        'AC_LIBTOOL_SYS_DYNAMIC_LINKER' => 1,
                         'LT_PROG_GCJ' => 1,
+                        'AC_LIBTOOL_SYS_DYNAMIC_LINKER' => 1,
                         'DX_FEATURE_chm' => 1,
                         'AM_INIT_AUTOMAKE' => 1,
                         'DX_FEATURE_rtf' => 1,
@@ -235,32 +235,32 @@
                         '_LT_AC_LOCK' => 1,
                         '_LT_AC_LANG_RC_CONFIG' => 1,
                         'LT_PROG_GO' => 1,
-                        'LT_SYS_MODULE_PATH' => 1,
                         'DX_ENV_APPEND' => 1,
-                        'AC_LIBTOOL_POSTDEP_PREDEP' => 1,
+                        'LT_SYS_MODULE_PATH' => 1,
                         'LT_WITH_LTDL' => 1,
-                        'AC_LTDL_SHLIBPATH' => 1,
-                        'AX_GCC_ARCHFLAG' => 1,
+                        'AC_LIBTOOL_POSTDEP_PREDEP' => 1,
                         'DX_ARG_ABLE' => 1,
+                        'AX_GCC_ARCHFLAG' => 1,
+                        'AC_LTDL_SHLIBPATH' => 1,
                         'AX_OPENMP' => 1,
                         'AM_AUX_DIR_EXPAND' => 1,
-                        '_LT_AC_LANG_F77_CONFIG' => 1,
                         'AC_LIBTOOL_PROG_COMPILER_NO_RTTI' => 1,
-                        '_AM_SET_OPTIONS' => 1,
+                        '_LT_AC_LANG_F77_CONFIG' => 1,
                         '_LT_COMPILER_OPTION' => 1,
-                        '_AM_OUTPUT_DEPENDENCY_COMMANDS' => 1,
+                        '_AM_SET_OPTIONS' => 1,
                         'AM_RUN_LOG' => 1,
-                        'AC_LIBTOOL_SYS_OLD_ARCHIVE' => 1,
-                        'AC_LTDL_SYS_DLOPEN_DEPLIBS' => 1,
+                        '_AM_OUTPUT_DEPENDENCY_COMMANDS' => 1,
                         'AC_LIBTOOL_PICMODE' => 1,
-                        'LT_PATH_LD' => 1,
-                        'AC_CHECK_LIBM' => 1,
+                        'AC_LTDL_SYS_DLOPEN_DEPLIBS' => 1,
+                        'AC_LIBTOOL_SYS_OLD_ARCHIVE' => 1,
                         'ACX_PTHREAD' => 1,
+                        'AC_CHECK_LIBM' => 1,
+                        'LT_PATH_LD' => 1,
                         'AC_LIBTOOL_SYS_LIB_STRIP' => 1,
                         '_AM_MANGLE_OPTION' => 1,
-                        'AC_LTDL_SYMBOL_USCORE' => 1,
-                        'AC_LIBTOOL_SYS_MAX_CMD_LEN' => 1,
                         'DX_FEATURE_xml' => 1,
+                        'AC_LIBTOOL_SYS_MAX_CMD_LEN' => 1,
+                        'AC_LTDL_SYMBOL_USCORE' => 1,
                         'AM_SET_DEPDIR' => 1,
                         '_LT_CC_BASENAME' => 1,
                         'DX_FEATURE_ps' => 1,
@@ -280,57 +280,57 @@
                         'configure.in'
                       ],
                       {
-                        '_LT_AC_TAGCONFIG' => 1,
                         'AM_PROG_F77_C_O' => 1,
-                        'AC_INIT' => 1,
+                        '_LT_AC_TAGCONFIG' => 1,
                         'm4_pattern_forbid' => 1,
-                        '_AM_COND_IF' => 1,
+                        'AC_INIT' => 1,
                         'AC_CANONICAL_TARGET' => 1,
-                        'AC_SUBST' => 1,
+                        '_AM_COND_IF' => 1,
                         'AC_CONFIG_LIBOBJ_DIR' => 1,
-                        'AC_FC_SRCEXT' => 1,
+                        'AC_SUBST' => 1,
                         'AC_CANONICAL_HOST' => 1,
+                        'AC_FC_SRCEXT' => 1,
                         'AC_PROG_LIBTOOL' => 1,
                         'AM_INIT_AUTOMAKE' => 1,
-                        'AM_PATH_GUILE' => 1,
                         'AC_CONFIG_SUBDIRS' => 1,
+                        'AM_PATH_GUILE' => 1,
                         'AM_AUTOMAKE_VERSION' => 1,
                         'LT_CONFIG_LTDL_DIR' => 1,
-                        'AC_REQUIRE_AUX_FILE' => 1,
                         'AC_CONFIG_LINKS' => 1,
-                        'm4_sinclude' => 1,
+                        'AC_REQUIRE_AUX_FILE' => 1,
                         'LT_SUPPORTED_TAG' => 1,
+                        'm4_sinclude' => 1,
                         'AM_MAINTAINER_MODE' => 1,
                         'AM_NLS' => 1,
                         'AC_FC_PP_DEFINE' => 1,
                         'AM_GNU_GETTEXT_INTL_SUBDIR' => 1,
-                        'AM_MAKEFILE_INCLUDE' => 1,
                         '_m4_warn' => 1,
+                        'AM_MAKEFILE_INCLUDE' => 1,
                         'AM_PROG_CXX_C_O' => 1,
-                        '_AM_COND_ENDIF' => 1,
                         '_AM_MAKEFILE_INCLUDE' => 1,
+                        '_AM_COND_ENDIF' => 1,
                         'AM_ENABLE_MULTILIB' => 1,
                         'AM_SILENT_RULES' => 1,
                         'AM_PROG_MOC' => 1,
                         'AC_CONFIG_FILES' => 1,
-                        'include' => 1,
                         'LT_INIT' => 1,
-                        'AM_PROG_AR' => 1,
+                        'include' => 1,
                         'AM_GNU_GETTEXT' => 1,
+                        'AM_PROG_AR' => 1,
                         'AC_LIBSOURCE' => 1,
-                        'AM_PROG_FC_C_O' => 1,
                         'AC_CANONICAL_BUILD' => 1,
+                        'AM_PROG_FC_C_O' => 1,
                         'AC_FC_FREEFORM' => 1,
-                        'AH_OUTPUT' => 1,
                         'AC_FC_PP_SRCEXT' => 1,
-                        '_AM_SUBST_NOTMAKE' => 1,
+                        'AH_OUTPUT' => 1,
                         'AC_CONFIG_AUX_DIR' => 1,
-                        'sinclude' => 1,
-                        'AM_PROG_CC_C_O' => 1,
+                        '_AM_SUBST_NOTMAKE' => 1,
                         'm4_pattern_allow' => 1,
-                        'AM_XGETTEXT_OPTION' => 1,
-                        'AC_CANONICAL_SYSTEM' => 1,
+                        'AM_PROG_CC_C_O' => 1,
+                        'sinclude' => 1,
                         'AM_CONDITIONAL' => 1,
+                        'AC_CANONICAL_SYSTEM' => 1,
+                        'AM_XGETTEXT_OPTION' => 1,
                         'AC_CONFIG_HEADERS' => 1,
                         'AC_DEFINE_TRACE_LITERAL' => 1,
                         'AM_POT_TOOLS' => 1,
@@ -351,57 +351,57 @@
                         'configure.in'
                       ],
                       {
-                        'AM_PROG_F77_C_O' => 1,
                         '_LT_AC_TAGCONFIG' => 1,
-                        'm4_pattern_forbid' => 1,
+                        'AM_PROG_F77_C_O' => 1,
                         'AC_INIT' => 1,
-                        '_AM_COND_IF' => 1,
+                        'm4_pattern_forbid' => 1,
                         'AC_CANONICAL_TARGET' => 1,
-                        'AC_SUBST' => 1,
+                        '_AM_COND_IF' => 1,
                         'AC_CONFIG_LIBOBJ_DIR' => 1,
-                        'AC_FC_SRCEXT' => 1,
+                        'AC_SUBST' => 1,
                         'AC_CANONICAL_HOST' => 1,
+                        'AC_FC_SRCEXT' => 1,
                         'AC_PROG_LIBTOOL' => 1,
                         'AM_INIT_AUTOMAKE' => 1,
-                        'AM_PATH_GUILE' => 1,
                         'AC_CONFIG_SUBDIRS' => 1,
+                        'AM_PATH_GUILE' => 1,
                         'AM_AUTOMAKE_VERSION' => 1,
                         'LT_CONFIG_LTDL_DIR' => 1,
-                        'AC_CONFIG_LINKS' => 1,
                         'AC_REQUIRE_AUX_FILE' => 1,
-                        'm4_sinclude' => 1,
+                        'AC_CONFIG_LINKS' => 1,
                         'LT_SUPPORTED_TAG' => 1,
+                        'm4_sinclude' => 1,
                         'AM_MAINTAINER_MODE' => 1,
                         'AM_NLS' => 1,
                         'AC_FC_PP_DEFINE' => 1,
                         'AM_GNU_GETTEXT_INTL_SUBDIR' => 1,
-                        'AM_MAKEFILE_INCLUDE' => 1,
                         '_m4_warn' => 1,
+                        'AM_MAKEFILE_INCLUDE' => 1,
                         'AM_PROG_CXX_C_O' => 1,
-                        '_AM_MAKEFILE_INCLUDE' => 1,
                         '_AM_COND_ENDIF' => 1,
+                        '_AM_MAKEFILE_INCLUDE' => 1,
                         'AM_ENABLE_MULTILIB' => 1,
                         'AM_PROG_MOC' => 1,
                         'AM_SILENT_RULES' => 1,
                         'AC_CONFIG_FILES' => 1,
-                        'LT_INIT' => 1,
                         'include' => 1,
-                        'AM_GNU_GETTEXT' => 1,
+                        'LT_INIT' => 1,
                         'AM_PROG_AR' => 1,
+                        'AM_GNU_GETTEXT' => 1,
                         'AC_LIBSOURCE' => 1,
-                        'AM_PROG_FC_C_O' => 1,
                         'AC_CANONICAL_BUILD' => 1,
+                        'AM_PROG_FC_C_O' => 1,
                         'AC_FC_FREEFORM' => 1,
-                        'AH_OUTPUT' => 1,
                         'AC_FC_PP_SRCEXT' => 1,
-                        'AC_CONFIG_AUX_DIR' => 1,
+                        'AH_OUTPUT' => 1,
                         '_AM_SUBST_NOTMAKE' => 1,
-                        'm4_pattern_allow' => 1,
-                        'sinclude' => 1,
+                        'AC_CONFIG_AUX_DIR' => 1,
                         'AM_PROG_CC_C_O' => 1,
-                        'AM_XGETTEXT_OPTION' => 1,
-                        'AC_CANONICAL_SYSTEM' => 1,
+                        'sinclude' => 1,
+                        'm4_pattern_allow' => 1,
                         'AM_CONDITIONAL' => 1,
+                        'AC_CANONICAL_SYSTEM' => 1,
+                        'AM_XGETTEXT_OPTION' => 1,
                         'AC_CONFIG_HEADERS' => 1,
                         'AC_DEFINE_TRACE_LITERAL' => 1,
                         'AM_POT_TOOLS' => 1,
diff --git a/examples/test_qr.c b/examples/test_qr.c
index 97860917eb98eb1928a1ab37ce0568bc3f76ec38..d5d2fb8db9991dba697b0d0620976955b3d3f1ac 100644
--- a/examples/test_qr.c
+++ b/examples/test_qr.c
@@ -34,6 +34,17 @@
 /* Local includes. */
 #include "quicksched.h"
 
+void printMatrix(double* Matrix, int m, int n, int tilesize) {
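+  /* Debug helper: prints the (m*tilesize) x (n*tilesize) column-major matrix row by row. */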
+  int i, j;
+
+  for (i = 0; i < m * tilesize; i++) {
+    for (j = 0; j < n * tilesize; j++) {
+      printf(" %.3f ", Matrix[j * m * tilesize + i]);
+    }
+    printf("\n");
+  }
+}
+
 /**
  * Takes a column major matrix, NOT tile major. size is length of a side of the
  * matrix. Only works for square matrices.
@@ -85,14 +96,13 @@ double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) {
                   1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
           }
         }
-
         /* Qtemp = Qtemp * (I-tau*w*w')' */
         cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
                     1.0, Qtemp, size, ww, size, 0.0, temp, size);
         double* b = Qtemp;
         Qtemp = temp;
         temp = b;
       }
       cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
                   1.0, Q, size, Qtemp, size, 0.0, temp, size);
       double* b = Q;
@@ -120,16 +130,7 @@ double* getR(double* HR, int size) {
   return R;
 }
 
-void printMatrix(double* Matrix, int m, int n, int tilesize) {
-  int i, j;
 
-  for (i = 0; i < m * tilesize; i++) {
-    for (j = 0; j < n * tilesize; j++) {
-      printf(" %.3f ", Matrix[j * m * tilesize + i]);
-    }
-    printf("\n");
-  }
-}
 
 double* columnToTile(double* columnMatrix, int size, int m, int n,
                      int tilesize) {
@@ -395,6 +396,20 @@ void DTSQRF(double* restrict cornerTile, double* restrict columnTile,
   }
 }
 
+/* Generates a random matrix. */
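+/* size is the total number of entries; m_z seeds a simple linear congruential
+   generator, so the output is deterministic for a given seed and lies in [0, 100). */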
+double* generateColumnMatrix(int size, unsigned long int m_z) {
+  double* matrix = malloc(sizeof(double) * size);
+  if (matrix == NULL) error("Failed to allocate matrix");
+
+  int i;
+  for (i = 0; i < size; i++) {
+    m_z = (1664525 * m_z + 1013904223) % 4294967296;
+    matrix[i] = m_z % 100;
+    if (matrix[i] < 0) matrix[i] += 100;
+  }
+  return matrix;
+}
+
 /**
  *
  * @brief Applies the householder factorisation of the corner to the row tile.
@@ -511,6 +526,8 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
     else
       A_orig[k] = matrix[k];
   }
+  if (matrix == NULL)
+    A_orig = generateColumnMatrix(m * n * K * K, 35532);
   memcpy(A, A_orig, sizeof(double) * m * n * K * K);
   bzero(tau, sizeof(double) * m * n * K);
 
@@ -588,7 +605,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
         qsched_addlock(&s, tid_new, rid[j * m + i]);
         qsched_adduse(&s, tid_new, rid[k * m + i]);
         qsched_adduse(&s, tid_new, rid[j * m + k]);
-        // qsched_addunlock(&s, tid[k * m + i], tid_new);
+        qsched_addunlock(&s, tid[k * m + i], tid_new);
         qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
         if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
 
@@ -600,11 +617,11 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
   tot_setup = getticks() - tic;
 
   /* Dump the number of tasks. */
-  message("total nr of tasks: %i.", s.count);
-  message("total nr of deps: %i.", s.count_deps);
-  message("total nr of res: %i.", s.count_res);
-  message("total nr of locks: %i.", s.count_locks);
-  message("total nr of uses: %i.", s.count_uses);
+  //message("total nr of tasks: %i.", s.count);
+  //message("total nr of deps: %i.", s.count_deps);
+  //message("total nr of res: %i.", s.count_res);
+  //message("total nr of locks: %i.", s.count_locks);
+  //message("total nr of uses: %i.", s.count_uses);
 
   /* Loop over the number of runs. */
   for (k = 0; k < runs; k++) {
@@ -613,7 +630,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
     tic = getticks();
     qsched_run(&s, nr_threads, runner);
     toc_run = getticks();
-    message("%ith run took %lli ticks...", k, toc_run - tic);
+    // message("%ith run took %lli ticks...", k, toc_run - tic);
     tot_run += toc_run - tic;
   }
 
@@ -643,20 +660,21 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
       } */
 
   /* Dump the costs. */
-  message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup,
-          tot_run / runs);
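+  /* itpms converts scheduler ticks to milliseconds, assuming CPU_TPS is the tick rate in Hz. */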
+  double itpms = 1000.0 / CPU_TPS;
+  message("costs: setup=%.3f ms, run=%.3f ms.", ((double)tot_setup) * itpms,
+          ((double)tot_run / runs) * itpms);
 
   /* Dump the timers. */
-  for (k = 0; k < qsched_timer_count; k++)
+  /*for (k = 0; k < qsched_timer_count; k++)
     message("timer %s is %lli ticks.", qsched_timer_names[k],
-            s.timers[k] / runs);
+            s.timers[k] / runs);*/
 
   if (matrix != NULL) {
     for (k = 0; k < m * n * K * K; k++) matrix[k] = A[k];
   }
 
   /* Test if the decomposition was correct.*/
-  /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+/*  double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
    double *Q = computeQ(tempMatrix, m*K, K, tau, m);
    double *R = getR(tempMatrix, m*K);
    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
@@ -683,19 +701,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
   qsched_free(&s);
 }
 
-/* Generates a random matrix. */
-double* generateColumnMatrix(int size, unsigned long int m_z) {
-  double* matrix = malloc(sizeof(double) * size * size);
-  if (matrix == NULL) error("Failed to allocate matrix");
 
-  int i;
-  for (i = 0; i < size * size; i++) {
-    m_z = (1664525 * m_z + 1013904223) % 4294967296;
-    matrix[i] = m_z % 100;
-    if (matrix[i] < 0) matrix[i] += 100;
-  }
-  return matrix;
-}
 
 
 /* Unit tests for the tiled QR decomposition.*/
@@ -703,10 +709,10 @@ void test_SSSRFT()
 {
     double* cornerTile, *rowTile, *columnTile, *otherTile;
     double tau[1024];
-    cornerTile = generateColumnMatrix(32, 35136);
-    rowTile = generateColumnMatrix(32, 35239);
-    columnTile = generateColumnMatrix(32, 35339);
-    otherTile = generateColumnMatrix(32, 35739);
+    cornerTile = generateColumnMatrix(32*32, 35136);
+    rowTile = generateColumnMatrix(32*32, 35239);
+    columnTile = generateColumnMatrix(32*32, 35339);
+    otherTile = generateColumnMatrix(32*32, 35739);
 DGEQRF(cornerTile, 32, tau, 0, 2);
     DTSQRF(cornerTile, columnTile,
             32, 1, 0, tau,
@@ -733,8 +739,8 @@ void test_STSQRF()
 {
     double *cornerTile, *columnTile;
     double tau[1024];
-    cornerTile = generateColumnMatrix(32, 35531);
-    columnTile = generateColumnMatrix(32, 35585);
+    cornerTile = generateColumnMatrix(32*32, 35531);
+    columnTile = generateColumnMatrix(32*32, 35585);
     DGEQRF(cornerTile, 32, tau, 0, 2);
     DTSQRF(cornerTile, columnTile,
             32, 1, 0, tau,
@@ -749,8 +755,8 @@ void test_SLARFT()
 {
     double* cornerTile, *rowTile;
     double tau[1024];
-    cornerTile = generateColumnMatrix(32, 35536);
-    rowTile = generateColumnMatrix(32, 35539);
+    cornerTile = generateColumnMatrix(32*32, 35536);
+    rowTile = generateColumnMatrix(32*32, 35539);
     DGEQRF(cornerTile, 32,tau, 0, 2);
     DLARFT(cornerTile,rowTile, 32,
             1, 0, tau, 2); 
@@ -766,7 +772,7 @@ void test_SGEQRF()
 {
     double* cornerTile;
     double tau[1024];
-    cornerTile = generateColumnMatrix(32, 35532);
+    cornerTile = generateColumnMatrix(32*32, 35532);
     DGEQRF( cornerTile, 32,
             tau, 0, 2);
     printMatrix(cornerTile, 1, 1, 32);
@@ -774,10 +780,10 @@ void test_SGEQRF()
 }
 void runTests()
 {
- //   test_SGEQRF();
+    test_SGEQRF();
 //    test_SLARFT();
 //    test_STSQRF();
-    test_SSSRFT();
+//    test_SSSRFT();
 }
 
 
diff --git a/examples/test_qr.cu b/examples/test_qr.cu
index 95b6ad51ea76ebe51707e5ad37bd33046bf6338b..1e8be2456b665d6f53395e0a07ed1604c2d60687 100644
--- a/examples/test_qr.cu
+++ b/examples/test_qr.cu
@@ -7,13 +7,18 @@
 #include <string.h>
 #include <unistd.h>
 #include <math.h>
+#include <gperftools/profiler.h>
+
+
 
 /* Local includes. */
 extern "C"{
 #include "quicksched.h"
+#include <cblas.h>
 }
 #include "cuda_queue.h"
 
+int g_size;
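+/* Rough running total of bytes pinned with cudaMallocHost; reported when an allocation fails. */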
 enum task_types { task_SGEQRF , task_SLARFT , task_STSQRF , task_SSSRFT} ;
 
 //#define TID threadIdx.x
@@ -28,6 +33,7 @@ __device__ float *GPU_tau;
 __device__ int cuda_m;
 __device__ int cuda_n;
 
+
 void printMatrix(float *matrix, int m, int n, int tilesize)
 {
     int i, j;
@@ -44,16 +50,110 @@ void printMatrix(float *matrix, int m, int n, int tilesize)
 
 }
 
+/**
+ * Takes a column major matrix, NOT tile major. size is length of a side of the
+ * matrix. Only works for square matrices.
+ * This function is simply for validation and is implemented naively as we know
+ * of no implementation to retrieve Q from the tiled QR.
+ */
+float* computeQ(float* HR, int size, int tilesize, float* tau, int tauNum) {
+  float* Q = (float*)malloc(sizeof(float) * size * size);
+  float* Qtemp = (float*)malloc(sizeof(float) * size * size);
+  float* w = (float*)malloc(sizeof(float) * size);
+  float* ww = (float*)malloc(sizeof(float) * size * size);
+  float* temp = (float*)malloc(sizeof(float) * size * size);
+  int i, k, l, j, n;
+  bzero(Q, sizeof(float) * size * size);
+  bzero(Qtemp, sizeof(float) * size * size);
+  bzero(ww, sizeof(float) * size * size);
+
+  for (i = 0; i < size; i++) {
+    Q[i * size + i] = 1.0;
+  }
+  int numcoltile = size / tilesize;
+  int numrowtile = size / tilesize;
+  for (k = 0; k < numrowtile; k++) {
+    for (l = 0; l < tilesize; l++) {
+      bzero(Qtemp, sizeof(float) * size * size);
+      for (i = 0; i < size; i++) {
+        Qtemp[i * size + i] = 1.0;
+      }
+
+      for (i = k; i < numcoltile; i++) {
+        bzero(w, sizeof(float) * size);
+
+        for (j = 0; j < tilesize; j++) {
+          w[i * tilesize + j] =
+              HR[(k * tilesize + l) * size + i * tilesize + j];
+        }
+        w[k * tilesize + l] = 1.0;
+        if (k * tilesize + l > i * tilesize) {
+          for (j = 0; j < k * tilesize + l; j++) w[j] = 0.0;
+        }
+
+        /* Compute (I - tau*w*w')' */
+        for (j = 0; j < size; j++) {
+          for (n = 0; n < size; n++) {
+            if (j != n)
+              ww[n * size + j] =
+                  -tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
+            else
+              ww[n * size + j] =
+                  1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
+          }
+        }
+        /* Qtemp = Qtemp * (I-tau*w*w')' */
+        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
+                    1.0, Qtemp, size, ww, size, 0.0, temp, size);
+        float* b = Qtemp;
+        Qtemp = temp;
+        temp = b;
+      }
+      cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
+                  1.0, Q, size, Qtemp, size, 0.0, temp, size);
+      float* b = Q;
+      Q = temp;
+      temp = b;
+    }
+  }
+
+  free(Qtemp);
+  free(w);
+  free(ww);
+  free(temp);
+  return Q;
+}
+
+float* getR(float* HR, int size) {
+  float* R = (float*)malloc(sizeof(float) * size * size);
+  int i, j;
+  bzero(R, sizeof(float) * size * size);
+  for (i = 0; i < size; i++) {
+    for (j = 0; j <= i; j++) {
+      R[i * size + j] = HR[i * size + j];
+    }
+  }
+  return R;
+}
+
+
+
 /* Generates a random matrix. */
 float* generateColumnMatrix(int size, unsigned long int m_z)
 {
     float* matrix;
-    cudaMallocHost(&matrix, sizeof(float)*size*size);
+    cudaError_t code = cudaMallocHost(&matrix, sizeof(float)*size/**size*/);
+    if(code != cudaSuccess)
+        printf("%s size = %i g_size = %i\n", cudaGetErrorString(code), size, g_size);
+    else
+        g_size = g_size + sizeof(float)*size;
     if(matrix == NULL)
         error("Failed to allocate matrix");
 
     int i;
-    for(i = 0 ; i < size*size; i++)
+    for(i = 0 ; i < size; i++)
     {
         m_z = (1664525*m_z + 1013904223) % 4294967296;
         matrix[i] = m_z % 100;
@@ -454,43 +554,39 @@ __device__ void runner ( int type , void *data ) {
 	workVector = blockCache + (32*32);
         /* Decode the task data. */
         int *idata = (int *)data;
-        int i = idata[0], j = idata[1], k = idata[2];
+        int i = idata[0];
+        int j = idata[1];
+        int k = idata[2];
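+        /* Tiles are stored column major: tile (row, col) starts at (col*cuda_m + row)*32*32. */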
     //    int z;
 //        double buff[ 2*K*K ];
         /* Decode and execute the task. */
         switch ( type ) {
             case task_SGEQRF:
+                //if(threadIdx.x == 0)
+                  //  printf("SGEQRF %i %i\n", k, (k*cuda_m+k)*32*32);
                 if(threadIdx.x < 32)
-//                    runner_cuda_SGEQRF(i, j, k, workVector, blockCache);
-
-                if(threadIdx.x == 0){
-                    printf("SGEQRF: %i %i %i + %i\n",i, j , k, clock());
-                printf("tau = ");
-                for(k = 0; k < cuda_m * cuda_n * 32 ; k++)
-                {
-                    printf("%.3f ", GPU_tau[k]);
-                }
-                printf("\n");}
+                    SGEQRF( &GPU_matrix[(k*cuda_m+k)*32*32], 32, GPU_tau, k, cuda_m);
                 break;
             case task_SLARFT:
-                if(threadIdx.x == 0)
-                    printf("SLARFT: %i %i %i + %i\n",i,j,k, clock());
-                //runner_cuda_SLARFT(i, j, k, workVector, blockCache);
+        //        if(threadIdx.x == 0)
+        //            printf("SLARFT %i %i %i\n", k,j, (j*cuda_m+k)*32*32);
+                SLARFT( &GPU_matrix[(k*cuda_m+k)*32*32], &GPU_matrix[(j*cuda_m+k)*32*32], 32, j, k, GPU_tau, cuda_m);
                 break;
             case task_STSQRF:
-                if(threadIdx.x == 0)
-                    printf("STSQRF: %i %i %i + %i\n",i,j,k, clock());
-                //runner_cuda_STSQRF(i, j, k, workVector, blockCache);
+          //      if(threadIdx.x == 0)
+         //           printf("STSQRF %i %i %i\n", k,i, (k*cuda_m+i)*32*32);
+                if(threadIdx.x < 32)
+                    STSQRF( &GPU_matrix[(k*cuda_m+k)*32*32], &GPU_matrix[(k*cuda_m + i)*32*32], 32, i, k, GPU_tau, cuda_m);
                 break;
             case task_SSSRFT:
-                if(threadIdx.x == 0)
-                    printf("SSSRFT: %i %i %i + %i\n",i,j,k, clock());
-                //runner_cuda_SSSRFT(i, j, k, workVector, blockCache);
+         //       if(threadIdx.x == 0)
+         //           printf("SSSRFT %i %i %i %i\n", k,j, i, (j*cuda_m+i)*32*32);
+                SSSRFT( &GPU_matrix[(j*cuda_m+i)*32*32], &GPU_matrix[(k*cuda_m+i)*32*32], &GPU_matrix[(j*cuda_m+k)*32*32], 32, i, j, k, GPU_tau, cuda_m );
                 break;
             default:
                 asm("trap;");
             }
-
+        __syncthreads();
 }
 
 __device__ qsched_funtype function = runner;
@@ -505,6 +601,7 @@ float* generateMatrix( int m, int n)
 {
     float* Matrix;
     cudaMallocHost(&Matrix, sizeof(float) * m*n*32*32);
+    g_size = g_size + sizeof(float)*m*n*32*32;
     if(Matrix == NULL)
         error("Failed to allocate Matrix");
     int i, j;
@@ -557,40 +654,33 @@ float* columnToTile( float* columnMatrix, int size , int m , int n )
 
 }
 
-float* tileToColumn( float* tileMatrix, int size, int m , int n )
-{
-    float* ColumnMatrix;
-    ColumnMatrix = (float*) malloc(sizeof(float) * size );
-    if(ColumnMatrix == NULL)
-        error("failed to allocate ColumnMatrix");
-    int rows = m*32;
-    int columns = n*32;
-    int i,j,k,l;
-    for( i = 0; i < n ; i++ )
-    {
-        for(j = 0; j < m ; j++ )
-        {
-            /* Tile on ith column is at i*m*32*32.*/
-            /* Tile on jth is at j*32*32 */
-            float *tile = &tileMatrix[i*m*32*32 + j*32*32];
-            /* Column starts at same position as tile. */
-            /* Row j*32.*/
-            float *tilePos = &ColumnMatrix[i*m*32*32 + j*32];
-            for( k = 0; k < 32; k++ )
-            {
-                for(l=0; l < 32; l++)
-                {
-                    tilePos[l] = tile[l];
-                }
-                /* Next 32 elements are the position of the tile in the next column.*/
-                tile = &tile[32];
-                /* Move to the j*32th position in the next column. */                
-                tilePos = &tilePos[32*m];
-
-            }
-        }   
+float* tileToColumn(float* tileMatrix, int size, int m, int n, int tilesize) {
+  float* ColumnMatrix;
+  ColumnMatrix = (float*)malloc(sizeof(float) * size);
+  if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix");
+  int i, j, k, l;
+  for (i = 0; i < n; i++) {
+    for (j = 0; j < m; j++) {
+      /* Tile on ith column is at i*m*32*32.*/
+      /* Tile on jth is at j*32*32 */
+      float* tile =
+          &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+      /* Column starts at same position as tile. */
+      /* Row j*32.*/
+      float* tilePos =
+          &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize];
+      for (k = 0; k < tilesize; k++) {
+        for (l = 0; l < tilesize; l++) {
+          tilePos[l] = tile[l];
+        }
+        /* Next 32 elements are the position of the tile in the next column.*/
+        tile = &tile[tilesize];
+        /* Move to the j*32th position in the next column. */
+        tilePos = &tilePos[tilesize * m];
+      }
     }
-    return ColumnMatrix;
+  }
+  return ColumnMatrix;
 }
 
 
@@ -672,24 +762,29 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
  /* Initialize the scheduler. */
     qsched_init( &s , 1 , qsched_flag_none );
     cudaDeviceReset();
+    if( cudaPeekAtLastError() != cudaSuccess)
+        error("Setup Failed: %s", cudaGetErrorString(cudaPeekAtLastError()));
     cudaSetDevice(0);
     Setup<<<1,1>>>();
-    if(cudaDeviceSynchronize() != cudaSuccess)
+    g_size = 0;
+    if(cudaDeviceSynchronize() != cudaSuccess)
     error("Setup Failed: %s", cudaGetErrorString(cudaPeekAtLastError()));
     
     /* Allocate and fill the original matrix. */
     if(cudaMallocHost(&A, sizeof(float) * m * n * K * K  ) != cudaSuccess ||
-        cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess ||
-        cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K  ) != cudaSuccess )
+        cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess /*||
+        cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K  ) != cudaSuccess*/ )
         error("Failed to allocate matrices.");
-    cudaFreeHost(A_orig);
+    g_size = g_size + sizeof(float) * m * n * K * K  + sizeof(float) * m * n * K;
+    //cudaFreeHost(A_orig);
 //    for ( k = 0 ; k < m * n * K * K ; k++ )
   //      A_orig[k] = 2.0f*((float)rand()) / RAND_MAX - 1.0f;
-    A_orig = generateMatrix(m, n);
+    A_orig = generateColumnMatrix(m*n*K*K, 35532);
+  //  printMatrix(A_orig, m, n);
     float *temp = columnToTile(A_orig, m * n * K *K,  m , n);
     cudaFreeHost(A_orig);
     A_orig = temp;
-    temp = tileToColumn(A_orig, m*n*K*K, m , n);
+    temp = tileToColumn(A_orig, m*n*K*K, m , n, K);
 //    printMatrix(temp, m, n);
     
 //    printTileMatrix(A_orig, m, n);
@@ -721,6 +816,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
         tid[k] = qsched_task_none;
         if( cudaHostGetDevicePointer(&temp_device_array , &A[k*32*32] , 0) != cudaSuccess)
             error("Failed to get device pointer to matrix");
+
         rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none , &A[k*32*32], sizeof(float) * 32 * 32, device_array + k * 32 *32);
         }
 
@@ -730,22 +826,20 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
         /* Add kth corner task. */
         data[0] = k; data[1] = k; data[2] = k;
         tid_new = qsched_addtask( &s , task_SGEQRF , task_flag_none , data , sizeof(int)*3 , 2 );
-        qsched_addlock( &s , tid_new , rid[ k*m + k ] );
-        if ( tid[ k*m + k ] != -1 )
-            qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
-        tid[ k*m + k ] = tid_new;
+        qsched_addlock(&s, tid_new, rid[k * m + k]);
+        if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new);
+        tid[k * m + k] = tid_new;
 
 
         /* Add column tasks on kth row. */
         for ( j = k+1 ; j < n ; j++ ) {
             data[0] = k; data[1] = j; data[2] = k;
             tid_new = qsched_addtask( &s , task_SLARFT , task_flag_none , data , sizeof(int)*3 , 3 );
-            qsched_addlock( &s , tid_new , rid[ j*m + k ] );
-            qsched_adduse( &s , tid_new , rid[ k*m + k ] );
-            qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
-            if ( tid[ j*m + k ] != -1 )
-                qsched_addunlock( &s , tid[ j*m + k ] , tid_new );
-            tid[ j*m + k ] = tid_new;
+            qsched_addlock(&s, tid_new, rid[j * m + k]);
+            qsched_adduse(&s, tid_new, rid[k * m + k]);
+            qsched_addunlock(&s, tid[k * m + k], tid_new);
+            if (tid[j * m + k] != -1) qsched_addunlock(&s, tid[j * m + k], tid_new);
+            tid[j * m + k] = tid_new;
             }
             
         /* For each following row... */
@@ -754,25 +848,23 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
             /* Add the row tasks for the kth column. */
             data[0] = i; data[1] = k; data[2] = k;
             tid_new = qsched_addtask( &s , task_STSQRF , task_flag_none , data , sizeof(int)*3 , 3 );
-            qsched_addlock( &s , tid_new , rid[ k*m + i ] );
-            qsched_adduse( &s , tid_new , rid[ k*m + k ] );
-            qsched_addunlock( &s , tid[ k*m + (i-1) ] , tid_new );
-            if ( tid[ k*m + i ] != -1 )
-                qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
-            tid[ k*m + i ] = tid_new;
-            
+            qsched_addlock(&s, tid_new, rid[k * m + i]);
+            qsched_adduse(&s, tid_new, rid[k * m + k]);
+            qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
+            if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
+            tid[k * m + i] = tid_new;
+
             /* Add the inner tasks. */
             for ( j = k+1 ; j < n ; j++ ) {
                 data[0] = i; data[1] = j; data[2] = k;
                 tid_new = qsched_addtask( &s , task_SSSRFT , task_flag_none , data , sizeof(int)*3 , 5 );
-                qsched_addlock( &s , tid_new , rid[ j*m + i ] );
-                qsched_adduse( &s , tid_new , rid[ k*m + i ] );
-                qsched_adduse( &s , tid_new , rid[ j*m + k ] );
-                qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
-                qsched_addunlock( &s , tid[ j*m + k ] , tid_new );
-                if ( tid[ j*m + i ] != -1 )
-                    qsched_addunlock( &s , tid[ j*m + i ] , tid_new );
-                tid[ j*m + i ] = tid_new;
+                qsched_addlock(&s, tid_new, rid[j * m + i]);
+                qsched_adduse(&s, tid_new, rid[k * m + i]);
+                qsched_adduse(&s, tid_new, rid[j * m + k]);
+                qsched_addunlock(&s, tid[k * m + i], tid_new);
+                qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
+                if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
+                tid[j * m + i] = tid_new;
                 }
         
             }
@@ -787,11 +879,42 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
     if( cudaMemcpyToSymbol( cuda_n, &n, sizeof(int), 0, cudaMemcpyHostToDevice ) != cudaSuccess )
         error("Failed to copy n to the device");    
 
+
         qsched_run_CUDA( &s , func );
-    cudaMemcpy( A , device_array , sizeof(float) * m * n * K * K, cudaMemcpyHostToDevice);
-    A = tileToColumn(A,m * n * K * K, m, n);
-    printMatrix(A, m, n);
+
+    if(cudaMemcpy(  tau , tau_device , sizeof(float) * m * n * K , cudaMemcpyDeviceToHost ) != cudaSuccess )
+        error("Failed to copy the tau data from the device.");
+
+   // printMatrix(tileToColumn(A, m*n*K*K, m, n, K), m, n);
+   /*float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+   float *Q = computeQ(tempMatrix, m*K, K, tau, m);
+   float *R = getR(tempMatrix, m*K);
+   cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
+   m*K, R, m*K, 0.0, tempMatrix, m*K);
+       free(Q);
+   // printMatrix(tempMatrix, m, n);
+   // printf("\n\n\n\n");
+       Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
+   for(i = 0; i < m * n * K * K; i++)
+   {
+       if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] <
+   0.995))
+           printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i],
+   tempMatrix[i]);
+   }
+   free(tempMatrix);
+   free(Q);
+   free(R);*/
+
+//    cudaMemcpy( A , device_array , sizeof(float) * m * n * K * K, cudaMemcpyHostToDevice);
+ //   A = tileToColumn(A,m * n * K * K, m, n, K);
+   // printMatrix(A, m, n);
 //    printTileMatrix(A, m , n);
+    struct task* tasks =  qsched_get_timers( &s , s.count );
+    /*for(i = 0; i < s.count; i++)
+    {
+        printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
+    }*/
 
 }
 
@@ -812,7 +935,7 @@ void test()
     printf("\n \n \n");
     //printTileMatrix(MatrixTile, m, n);
     free(Matrix);
-    Matrix = tileToColumn(MatrixTile, m*n*32*32, m , n);
+    Matrix = tileToColumn(MatrixTile, m*n*32*32, m , n, 32);
     printMatrix(Matrix, m, n);
     free(MatrixTile);
     free(Matrix);
@@ -832,11 +955,11 @@ __global__ void SGEQRF_test(float* cornerTile)
     
     for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
     {
-        printf("Copying value %i\n", i);
+       // printf("Copying value %i\n", i);
         tile[i] = cornerTile[i];
     }
 
-    printf("blockDim.x = %i\n", blockDim.x);
+   // printf("blockDim.x = %i\n", blockDim.x);
     __syncthreads();
     if(threadIdx.x < 32)
         SGEQRF(tile, 32, tau, 0, 1);
@@ -849,7 +972,7 @@ __global__ void SGEQRF_test(float* cornerTile)
 void test_SGEQRF()
 {
     float* cornerTile;
-    cornerTile = generateColumnMatrix(32, 35532);
+    cornerTile = generateColumnMatrix(32*32, 35532);
     SGEQRF_test<<<1, 128>>>(cornerTile);
     cudaDeviceSynchronize();
     printMatrix(cornerTile, 1, 1, 32);
@@ -886,8 +1009,8 @@ __global__ void SLARFT_test(float* cornerTile, float* rowTile)
 void test_SLARFT()
 {
     float* cornerTile, *rowTile;
-    cornerTile = generateColumnMatrix(32, 35536);
-    rowTile = generateColumnMatrix(32, 35539);
+    cornerTile = generateColumnMatrix(32*32, 35536);
+    rowTile = generateColumnMatrix(32*32, 35539);
     SLARFT_test<<<1, 128>>>(cornerTile, rowTile);
     cudaDeviceSynchronize();
     printMatrix(cornerTile, 1, 1, 32);
@@ -923,8 +1046,8 @@ __global__ void STSQRF_test(float* cornerTile, float* columnTile)
 void test_STSQRF()
 {
     float *cornerTile, *columnTile;
-    cornerTile = generateColumnMatrix(32, 35531);
-    columnTile = generateColumnMatrix(32, 35585);
+    cornerTile = generateColumnMatrix(32*32, 35531);
+    columnTile = generateColumnMatrix(32*32, 35585);
     STSQRF_test<<<1,32>>>(cornerTile, columnTile);
     cudaDeviceSynchronize();
     printMatrix(cornerTile, 1, 1, 32);
@@ -971,10 +1094,10 @@ __global__ void SSSRFT_test(float* cornerTile, float* columnTile, float* rowTile
 void test_SSSRFT()
 {
     float* cornerTile, *rowTile, *columnTile, *otherTile;
-    cornerTile = generateColumnMatrix(32, 35136);
-    rowTile = generateColumnMatrix(32, 35239);
-    columnTile = generateColumnMatrix(32, 35339);
-    otherTile = generateColumnMatrix(32, 35739);
+    cornerTile = generateColumnMatrix(32*32, 35136);
+    rowTile = generateColumnMatrix(32*32, 35239);
+    columnTile = generateColumnMatrix(32*32, 35339);
+    otherTile = generateColumnMatrix(32*32, 35739);
     SSSRFT_test<<<1, 128>>>(cornerTile, columnTile, rowTile, otherTile);
     cudaDeviceSynchronize();
     printMatrix(cornerTile, 1, 1, 32);
@@ -995,10 +1118,10 @@ void test_SSSRFT()
 
 void runTests()
 {
-   // test_SGEQRF();
+    test_SGEQRF();
   //  test_SLARFT();
  //   test_STSQRF();
-    test_SSSRFT();
+ //   test_SSSRFT();
 }
 /**
  * @brief Main function.
@@ -1056,5 +1179,6 @@ int main ( int argc , char *argv[] ) {
         
     test_qr( M , N , K , nr_threads , runs );
     
+    printf("\n");
     }
 
diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu
index 6e35f8390e5f8f9cfa1f858e980587393b7555a8..45add67c108e3d9a0c2ddcfdb07d76c4eb58a1b8 100644
--- a/src/cuda_queue.cu
+++ b/src/cuda_queue.cu
@@ -23,6 +23,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <gperftools/profiler.h>
 
 extern "C"{
 #include "quicksched.h"
@@ -52,6 +53,7 @@ __device__ qsched_task_t *deps_cuda;
 __device__ struct res *res_cuda;
 __device__ char *data_cuda;
 __device__ int cuda_barrier = 0;
+__device__ volatile int tot_num_tasks;
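+/* Tasks not yet taken from the queues; blocks exit once this reaches zero and no task could be fetched. */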
 
 __device__ qsched_funtype fun;
 /**
@@ -95,12 +97,11 @@ __device__ __inline__ void cuda_memcpy_tasks ( void *dest , void *source , int c
     /* Copy the data in chunks of sizeof(int). */
     for ( k = threadIdx.x ; k < count/sizeof(int) ; k += blockDim.x ){
         idest[k] = isource[k];
-//        idest[k] *= tid;
         }
         
     }
  
-__device__ int cuda_queue_gettask ( struct queue_cuda *q, int wait ) {
+__device__ int cuda_queue_gettask ( struct queue_cuda *q ) {
 
     int ind, tid = -1;
     
@@ -114,12 +115,14 @@ __device__ int cuda_queue_gettask ( struct queue_cuda *q, int wait ) {
     /* Wrap the index. */
     ind %= cuda_queue_size; 
     /* Loop until there is a valid task at that index. */
-    while ( q->rec_count < q->count && ( tid = q->data[ind] ) < 0 && wait);
+    while ( q->rec_count < q->count && ( tid = q->data[ind] ) < 0);
     
     /* Scratch the task from the queue */
     if ( tid >= 0 )
+    {
         q->data[ind] = -1;
-
+        atomicAdd((int*) &tot_num_tasks, -1);
+    }
     /* Return the acquired task ID. */
     return tid;
     
@@ -144,6 +147,8 @@ __device__ void cuda_queue_puttask ( struct queue_cuda *q , int tid ) {
 
     /* Write the task back to the queue. */
     q->data[ind] = tid;
+
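+    /* Advertise the task to runner_cuda_gettask, which checks nr_avail_tasks before polling the queue. */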
+    atomicAdd((int*)&q->nr_avail_tasks, 1);
     
     }
 
@@ -266,8 +271,6 @@ __device__ int cuda_locktask ( int tid ) {
 
 }
 
-//TODO
-
 /**
  * @brief Tell the #qsched that a task has completed.
  *
@@ -279,27 +282,10 @@ __device__ void cuda_done ( struct task *t ) {
 
     int k;
     struct task *t2;
-        
     
     /* Release this task's locks. */
     for ( k = 0 ; k < t->nr_locks ; k++ )
         cuda_unlockres( t->locks[k] );
-    
-    /* Loop over the task's unlocks... */
-    //for ( k = 0 ; k < t->nr_unlocks ; k++ ) {
-    
-        /* Get a grip on the unlocked task. */
-      //  t2 = &tasks_cuda[ t->unlocks[k] ];
-
-        /* Is the unlocked task ready to run? */
-        //if ( atomicAdd( &t2->wait, -1 ) == 1 && !( t2->flags & task_flag_skip ) )
-          //      cuda_queue_puttask( &cuda_queues[0] , t->unlocks[k] );
-            
-        //}
-        
-    /* Set the task stats. */
-    //t->toc = getticks();
-    //t->cost = t->toc - t->tic;
 
     }
 
@@ -314,16 +300,21 @@ __device__ void cuda_done ( struct task *t ) {
  * This routine blocks until a valid task is picked up, or the
  * specified queue is empty.
  */
-__device__ int runner_cuda_gettask ( struct queue_cuda *q, int wait ) {
+__device__ int runner_cuda_gettask ( struct queue_cuda *q ) {
 
     int tid = -1;
-    
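+    /* Decrement nr_avail_tasks optimistically; if the queue turns out to be empty,
+       restore the counter and return immediately. */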
+    if( atomicAdd((int*)&q->nr_avail_tasks, -1) <= 0)
+    {
+        atomicAdd((int*)&q->nr_avail_tasks, 1);
+        return -1;
+    }
     
     /* Main loop. */
-    while ( ( tid = cuda_queue_gettask( q , wait) ) >= 0 ) {
+    while ( ( tid = cuda_queue_gettask( q ) ) >= 0 ) {
         if( cuda_locktask(tid) == 1 )
             break;    
 
+        atomicAdd((int*)&q->nr_avail_tasks, -1);
         cuda_queue_puttask ( q , tid );
         }
         
@@ -353,13 +344,17 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q, int wait ) {
  * This routine blocks until a valid task is picked up, or the
  * specified queue is empty.
  */
-__device__ int runner_cuda_gettask ( struct queue_cuda *q, int wait ) {
+__device__ int runner_cuda_gettask ( struct queue_cuda *q ) {
 
     int tid = -1;
-    
+    if( atomicAdd((int*)&q->nr_avail_tasks, -1) <= 0)
+    {
+        atomicAdd((int*)&q->nr_avail_tasks, 1);
+        return -1;
+    }
     
     /* Main loop. */
-    while ( ( tid = cuda_queue_gettask( q , wait) ) >= 0 ) {
+    while ( ( tid = cuda_queue_gettask( q ) ) >= 0 ) {
         break;    
         }
         
@@ -380,6 +375,7 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q, int wait ) {
 __global__ void qsched_device_kernel ( )
 {
     volatile __shared__ int tid;
+    volatile __shared__ int done;
     int *src, *dest;
     int i;
 
@@ -389,27 +385,42 @@ __global__ void qsched_device_kernel ( )
         __syncthreads();
         if ( threadIdx.x == 0 ) {
              tid = -1;
-             if( cuda_queues[0].count == cuda_queues[0].rec_count )
-                tid = runner_cuda_gettask ( &cuda_queues[1], 1);
-             else
-                tid = runner_cuda_gettask ( &cuda_queues[1], 0);
+             /* Highest priority queue, holds the unload tasks. */
+             if( cuda_queues[2].nr_avail_tasks > 0 )
+                tid = runner_cuda_gettask ( &cuda_queues[2] );
+
+             /* Middle priority queue, contains user-specified tasks. */
+             if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 )
+                tid = runner_cuda_gettask ( &cuda_queues[0] );
 
-             if( tid < 0 )
-                tid = runner_cuda_gettask ( &cuda_queues[0], 1);
+             /* Low priority queue, contains the load tasks. */
+             if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0)
+                tid = runner_cuda_gettask ( &cuda_queues[1]);
             }
             
         /*Everyone wait for us to get a task id*/
         __syncthreads();
             
-        /* Exit if we didn't get a valid task. */
-        if(tid < 0)
+        /* Exit if all tasks have been taken from queues and we don't have work to do. */
+        if(tid < 0 && tot_num_tasks == 0)
             break;
 
+        #ifdef DEBUG_GPU
+        if(tid < 0 && cuda_queues[0].nr_avail_tasks == 0 && cuda_queues[1].nr_avail_tasks == 0 && cuda_queues[2].nr_avail_tasks == 0)
+            break;
+        #endif
+        /* If we couldn't find a task but some are not completed, try again. */
+        if(tid < 0)
+            continue;
 
-        /*Start the task clock*/
-        if( threadIdx.x == 0 ) 
-            tasks_cuda[tid].tic = clock();
 
+        /*Start the task clock*/
+        if( threadIdx.x == 0 ){
+            tasks_cuda[tid].blockID = blockIdx.x;
+            tasks_cuda[tid].tic = clock64();
+        }
+
+        /* Pick the task type to run; if it is not a load, unload or ghost task, call the user-supplied function. */
         if( tasks_cuda[tid].type == type_load )
         {
         int *d = (int*)&data_cuda[tasks_cuda[tid].data];
@@ -422,25 +433,31 @@ __global__ void qsched_device_kernel ( )
             src = (int*)res_cuda[d[0]].gpu_data;
             dest = (int*)res_cuda[d[0]].data;
             cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, d[0]);
-        }else{
+        }else if (tasks_cuda[tid].type != type_ghost ){
            fun(tasks_cuda[tid].type , &data_cuda[tasks_cuda[tid].data]);
         }
         __syncthreads();
         /*Stop the task clock*/
         if( threadIdx.x == 0 ) 
-            tasks_cuda[tid].toc = clock();
+            tasks_cuda[tid].toc = clock64();
 
         /*Unlocks*/
     #ifdef GPU_locks
         if(threadIdx.x == 0)
             cuda_done( &tasks_cuda[tid] );
-       // __syncthreads();
+        __syncthreads();
     #endif
         for(i = threadIdx.x; i < tasks_cuda[tid].nr_unlocks; i += blockDim.x )
         {
             if( atomicSub( &tasks_cuda[tasks_cuda[tid].unlocks[i]].wait , 1 ) == 1 && !( tasks_cuda[tasks_cuda[tid].unlocks[i]].flags & task_flag_skip ))
             {
-                cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] );
+                /* Place unloads into the highest-priority queue; any other task goes to the normal-priority queue. Load tasks are never unlocked. */
+                if(tasks_cuda[tasks_cuda[tid].unlocks[i]].type != type_unload)
+                {
+                    cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] );
+                }
+                else
+                    cuda_queue_puttask( &cuda_queues[2] , tasks_cuda[tid].unlocks[i] );
             }
         }
     }
@@ -449,23 +466,30 @@ __global__ void qsched_device_kernel ( )
 /* Make a notch on the barrier, last one out cleans up the mess... */
     __syncthreads();
     if ( threadIdx.x == 0 )
-        tid = ( atomicAdd( &cuda_barrier , 1 ) == gridDim.x-1 );
+        done = ( atomicAdd( &cuda_barrier , 1 ) == gridDim.x-1 );
     __syncthreads();
-    if ( tid  ) {
+    if ( done ) {
         if ( threadIdx.x == 0 ) {
             cuda_barrier = 0;
+    
+            /* Reset the load task queue so load tasks are ready again.*/
             volatile int *temp = cuda_queues[1].data; cuda_queues[1].data = cuda_queues[1].rec_data; cuda_queues[1].rec_data = temp;
             
+
+            /* Reset the queue indices; queue 1 is refilled with all of its load tasks. */
             cuda_queues[0].first = 0;
             cuda_queues[0].last = 0;
             cuda_queues[0].rec_count = 0;
             cuda_queues[1].first = 0;
-            cuda_queues[1].last = 0;
+            cuda_queues[1].last = cuda_queues[1].count;
             cuda_queues[1].rec_count = 0;
-
+            cuda_queues[2].first = 0;
+            cuda_queues[2].last = 0;
+            cuda_queues[2].rec_count = 0;
             }
 
-//TODO
+        //TODO
+        /* Resetting waits is done on the CPU at this time. This may change in the future. */
       /*  for ( int j = threadIdx.x ; j < cuda_nr_tasks ; j+= blockDim.x )
             for ( k = 0 ; k < tasks_cuda[j].nr_unlock ; k++ )
             {
@@ -479,7 +503,7 @@ __global__ void qsched_device_kernel ( )
 
 int maxVal( int *array, int size )
 {
-   int i, maxi=-32000;
+   int i, maxi=-3200000;
    for (i=0; i<size; i++)
    {
 	 if (array[i]>maxi)
@@ -506,30 +530,31 @@ int minVal( int *array, int size )
 void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, int parent, int *res, int *sorted )
 {
     int i,j;
+    int task, utask;
     if(numChildren > 0 && size/numChildren > 128*sizeof(int))
     {
         /* Create dummy task for this resource and recurse to its children!*/
-        int task, utask;
         task = qsched_addtask( s, type_ghost, task_flag_none, NULL, 0 , 0 );
+        qsched_adduse(s, task, ID);
         s->res[ID].task = task;
         utask = qsched_addtask( s , type_ghost, task_flag_none, NULL, 0 , 0 );
+        qsched_adduse(s, utask, ID);
         s->res[ID].task = utask;
         if(parent >= 0)
         {
             /* Create dependency to parent. */ 
-            qsched_addunlock(s, task, s->res[parent].task );
-            qsched_addunlock(s, s->res[parent].utask, utask );
+            /*qsched_addunlock(s, task, s->res[parent].task );
+            qsched_addunlock(s, s->res[parent].utask, utask );*/
         }
         for(i = sorted[ID]; i < sorted[ID+1]; i++)
         {
             qsched_create_loads(s, i, s->res[res[i]].size, sorted[i+1]-sorted[i], ID, res, sorted);
         }
     }else{
-    int task,utask;
         task = qsched_addtask( s , type_load , task_flag_none, &ID, sizeof(int), 0 );
         s->res[ID].task = task;
         utask = qsched_addtask( s , type_unload, task_flag_none, &ID, sizeof(int), 0 );
-        s->res[ID].task = utask;
+        s->res[ID].utask = utask;
         /* Create load task for this resource and set its children to completed with this task.*/
         for( j = sorted[ID]; j < sorted[ID+1]; j++ )
         {
@@ -538,15 +563,221 @@ void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, in
         /* If it has a parent then set the parents ghost task to be dependent on this.*/
         if(parent >= 0)
         {
-            qsched_addunlock(s, task, s->res[parent].task );
-            qsched_addunlock(s, s->res[parent].utask, utask);
+            /*qsched_addunlock(s, task, s->res[parent].task );
+            qsched_addunlock(s, s->res[parent].utask, utask);*/
+        }
+    }
+}
+
+#define MAX_DEPTH 0
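+/* Return 1 if task t uses or locks resource res, or if any task it unlocks does so within MAX_DEPTH levels of recursion. */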
+int transitive_use_unlocks(struct qsched *s, struct task *t, int res, int depth)
+{
+    int i;
+    for(i = 0; i < t->nr_uses; i++)
+    {
+        if(t->uses[i] == res)
+            return 1;
+    }
+    for(i = 0; i < t->nr_locks; i++)
+    {
+        if(t->locks[i] == res)
+            return 1;
+    }
+    if(depth >= MAX_DEPTH)
+    {
+//        printf("Max Depth reached\n");
+        return 0;
+    }
+    for(i = 0; i < t->nr_unlocks; i++)
+    {
+        if(transitive_use_unlocks(s, &s->tasks[t->unlocks[i]], res, depth + 1))
+            return 1;
+    }
+    return 0;
+}
+
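+/* Return 1 if task tid uses or locks resource res, or if any task that unlocks tid (found by scanning earlier tasks) does so within MAX_DEPTH levels of recursion. */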
+int transitive_use_locks(struct qsched *s, int tid, int res, int depth)
+{
+    int i,j;
+    struct task *new_t;
+    struct task *t = &s->tasks[tid];
+    for(i = 0; i < t->nr_uses; i++)
+    {
+        if(t->uses[i] == res)
+            return 1;
+    }
+    for(i = 0; i < t->nr_locks; i++)
+    {
+        if(t->locks[i] == res)
+            return 1;
+    }
+    if(depth >= MAX_DEPTH)
+    {
+//        printf("Max Depth reached\n");
+        return 0;
+    }
+    for(i = tid-1; i >= 0; i--)
+    {
+        new_t = &s->tasks[i];
+        for(j = 0; j < new_t->nr_unlocks; j++)
+        {
+            if(new_t->unlocks[j] == tid)
+            {
+                if(transitive_use_locks(s, i, res , depth + 1))
+                    return 1;
+            break;
+            }
         }
     }
+    return 0;
+}
+
+/*
+* Not used.
+*/
+void qsched_prepare_deps( struct qsched *s )
+{
+    int **is_loaded;
+    int **parents;
+    int i, k, j, use, usek, usem;
+    int *num_parents;
+    is_loaded = (int**)malloc(sizeof(int*) * s->count);
+    parents = (int**)malloc(sizeof(int*) * s->count);
+    num_parents = (int*)malloc(sizeof(int) * s->count);
+    bzero(num_parents, sizeof(int)*s->count);
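+    /* parents[i] will hold the tasks that directly unlock task i; num_parents[i] counts them. */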
+    k = (sizeof(int)*s->count_res)/32 +1;
+    for(i = 0; i < s->count; i++)
+    {
+        is_loaded[i] = (int*)malloc(k);
+        bzero(is_loaded[i], k);
+    }
+    /* is_loaded[i][k] is a bitmask covering resources 32*k .. 32*k+31 for task i. */
+
+    /* Reset the waits to 0... */
+   for( k = 0; k < s->count; k++ )
+    {
+        s->tasks[k].wait = 0;
+    }    
+
+    /* Run through the tasks and set the waits... */
+    for ( k = 0 ; k < s->count ; k++ ) {
+        struct task *t = &s->tasks[k];
+        if ( !( t->flags & task_flag_skip ) )
+            for ( j = 0 ; j < t->nr_unlocks ; j++ )
+                s->tasks[ t->unlocks[j] ].wait += 1;
+        }
+        
+    /* Sort the tasks topologically. */
+    int *tid = (int *)malloc( sizeof(int) * s->count );
+    for ( j = 0 , k = 0 ; k < s->count ; k++ )
+        if ( s->tasks[k].wait == 0 ) {
+            tid[j] = k;
+            j += 1;
+            }
+    for ( k = 0 ; k < j ; k++ ) {
+        struct task *t = &s->tasks[ tid[k] ];
+        for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ )
+            if ( ( s->tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) {
+                tid[j] = t->unlocks[kk];
+                j += 1;
+                }
+        }
+    if ( k < s->count )
+    {
+        error( "Circular dependencies detected." );
+     }
+    int max_parents = 0;
+    for(i = s->count-1; i >= 0; i--)
+    {
+        for(j = 0; j < s->tasks[i].nr_unlocks; j++)
+        {
+            num_parents[s->tasks[i].unlocks[j]]++;
+        }
+    }
+    for(i = 0; i < s->count; i++)
+    {
+        if(num_parents[i] > 0)
+            parents[i] = (int*)calloc(num_parents[i],sizeof(int));
+        else
+            parents[i] = NULL;
+
+        if(num_parents[i] > max_parents)
+        {
+            max_parents = num_parents[i];
+        }
+
+        num_parents[i] = 0;
+    }
+
+    for(i = 0; i < s->count; i++)
+    {
+        if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload)
+            continue;
+        for(k = 0; k < s->tasks[i].nr_uses; k++)
+        {
+            use = s->tasks[i].uses[k];
+            usek = use >> 5; // use / 32;
+            usem = use & 31; // use % 32.
+
+            if((is_loaded[i][usek] & (1 << (31-usem))) == 0 )
+            {
+                qsched_addunlock(s, s->res[use].task , i ) ;
+                is_loaded[i][usek] |= (1 <<(31-usem));
+            }
+        }
+        for(k = 0; k < s->tasks[i].nr_unlocks; k++)
+        {
+            if(s->tasks[s->tasks[i].unlocks[k]].type == type_load ||
+                    s->tasks[s->tasks[i].unlocks[k]].type == type_unload )
+                continue;
+            for(j = 0; j < s->count_res/32 +1; j++)
+            {
+                    is_loaded[s->tasks[i].unlocks[k]][j] |= is_loaded[i][j];
+            }
+            parents[s->tasks[i].unlocks[k]][num_parents[s->tasks[i].unlocks[k]]] = i;
+            num_parents[s->tasks[i].unlocks[k]] = num_parents[s->tasks[i].unlocks[k]] + 1;
+        }
+    }
+    max_parents = 0;
+    for(i = 0; i < s->count; i++)
+    {
+        if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload)
+            continue;
+        if(num_parents[i] > max_parents)
+        {
+            max_parents = num_parents[i];
+        }
+        bzero(is_loaded[i], k);
+    }
+    for(i = s->count-1; i >= 0; i--)
+    {
+        if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload)
+            continue;
+        for(k = 0; k < s->tasks[i].nr_uses; k++)
+        {
+            use = s->tasks[i].uses[k];
+            usek = use >> 5; // use / 32;
+            usem = use & 31; // use % 32.
+            if((is_loaded[i][usek] & (1 << (31-usem))) == 0 )
+            {   
+                qsched_addunlock(s, i, s->res[use].utask );
+                is_loaded[i][usek] |= (1 << (31-usem));
+            }
+        }
+        for(k = 0; k < num_parents[i]; k++)
+        {
+            for(j = 0; j < s->count_res/32 +1; j++)
+            {   
+                is_loaded[parents[i][k]][j] |= is_loaded[i][j];
+            }
+        }
+    }
+
 }
 
 void qsched_prepare_loads ( struct qsched *s ) {
 
-int i, task, unload, j , k , unlocked = 0;
+int i, task, unload, j , k , x, unlocked = 0;
 struct task *t;
 int *sorted, lastindex;
 int *res, *res_data;
@@ -556,6 +787,89 @@ if(s->res[0].task != -1)
     printf("Tasks already initialised, not redoing load/unload tasks");
     return;
 }
+double itpms = 1000.0 / CPU_TPS;
+ticks tic, toc_run;
+tic = getticks();
+
+/* Expand the deps array so we can add new dependencies in place. */
+qsched_task_t *deps_new, *deps_new_key;
+s->size_deps *= qsched_stretch;
+
+
+
+j = 0;
+//printf("%i\n", s->tasks[94].unlocks);
+//        for(k = 0; k < s->tasks[94].nr_unlocks; k++)
+//            printf("%i ", s->tasks[94].unlocks[k]);
+//        printf("\n");
+
+/* Allocate a new dependency list. */
+if ( ( deps_new = (int*) malloc(s->size_deps * sizeof(int) ) ) == NULL ||
+    ( deps_new_key = (int*) malloc(  s->size_deps * sizeof(int) ) ) == NULL )
+        error( "Failed to allocate new deps lists." );
+
+/* Copy the dependency list to the new list, leaving room after each task's unlocks to fit the unload dependencies added later. */
+for(i = 0; i < s->count; i++)
+{
+    t = &s->tasks[i];
+    /* It's possible we won't fit in unusual cases, so make sure to expand if needed. */
+    if(j + t->nr_unlocks + t->nr_uses + t->nr_locks > s->size_deps)
+    {
+    //    printf("\n");
+        qsched_task_t *temp1, *temp2;
+        s->size_deps *= qsched_stretch;
+        /* Allocate a new task list. */
+        if ( ( temp1 = (int*) malloc(s->size_deps * sizeof(int) ) ) == NULL ||
+           ( temp2 = (int*) malloc(  s->size_deps * sizeof(int) ) ) == NULL )
+            error( "Failed to allocate new deps lists." );
+        /* Copy the deps and keys over to the new list. */
+        memcpy( temp1 , deps_new , sizeof(int) * /*s->count_deps*/ j);
+        memcpy( temp2 , deps_new_key , sizeof(int) * /*s->count_deps*/ j );
+        int m;
+        /* Have to update since we aren't resorting later.*/
+        for(m = 0; m < i; m++)
+        {
+
+            t = &s->tasks[m];
+            t->unlocks = &temp1[t->unlocks - deps_new];
+        }
+        free(deps_new);
+        free(deps_new_key);
+        deps_new = temp1;
+        deps_new_key = temp2;
+//        printf("Stretch at line 828. m = %i\n", m);
+    t = &s->tasks[i];
+    }
+    int start_j = j;
+
+    /*if(i <= 94)
+{
+        printf("%i\n", s->tasks[94].unlocks);
+        for(k = 0; k < s->tasks[94].nr_unlocks; k++)
+            printf("%i ", s->tasks[94].unlocks[k]);
+        printf(" i = %i\n", i);
+        printf("%i\n", t->unlocks);
+}*/
+    for(k = 0; k < t->nr_unlocks; k++)
+    {
+        deps_new[j] = t->unlocks[k];
+        //if(i == 48)
+          //  printf("%i %i %i %i\n", t->unlocks +k,s->deps_key, s->deps, s->count_deps);
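+        /* Recover the key (the unlocking task) from this dependency's offset in the old deps array. */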
+        deps_new_key[j] = s->deps_key[&t->unlocks[k] - s->deps];
+        j++;
+    }
+    t->unlocks = &deps_new[start_j];
+    /*if(i == 94)
+{
+        for(k = 0; k < t->nr_unlocks; k++)
+            printf("%i ", t->unlocks[k]);
+        printf("\n");
+}*/
+    j+=t->nr_uses + t->nr_locks;
+}
+
+s->count_deps = j;
+
 
 /* Store number of children for each resource*/
 sorted = (int*) malloc(sizeof(int) * (s->count_res+1));
@@ -601,12 +915,13 @@ for(i = 0; i < s->count_res; i++)
         mini = res_data[i]; 
 //    printf("%i ", res_data[i]);
 }
+
 for(i = 0; i < s->count_res; i++)
 {
     res_data[i] -= mini;
-printf("%i ", res_data[i]);
+//printf("%i ", res_data[i]);
 }
-printf("\n");
+//printf("\n");
 
 /* Sort the children of each parent by memory address. */
 qsched_sort(res, res_data, sorted[0], minVal(res_data, sorted[0]), maxVal(res_data, sorted[0]));
@@ -621,13 +936,13 @@ for(i = 1; i < s->count_res; i++)
 qsched_sort(&res[sorted[s->count_res-1]], &res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1], minVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1]), maxVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1]));
 
 
-/*for(i = 0; i < s->count_res; i++)
+/*
+for(i = 0; i < s->count_res; i++)
 {
     printf("%i ", res_data[i]);
 }
 printf("\n");
 
-
 for(i = 0; i < s->count_res; i++)
 {
 printf("%i ", res[i]);
@@ -657,17 +972,20 @@ for(i = 1; i < s->count_res; i++)
     {
         for(j = sorted[i-1]; j < sorted[i]-1; j++)
         {
-            if(res_data[j] + s->res[res[j]].size > res_data[j+1])
+            if(res_data[j] + (s->res[res[j]].size/sizeof(int)) > res_data[j+1])
                 error("Overlapping resources are not allowed.");
         }
     }
 }
 
 /* Check super resources don't overlap.*/
-for( i = sorted[s->count_res-1]; i < s->count_res; i++ )
+for( i = sorted[s->count_res-1]; i < s->count_res-1; i++ )
 {
-    if(res_data[i] + s->res[res[i]].size > res_data[i+1])
+    if(res_data[i] + (s->res[res[i]].size/sizeof(int)) > res_data[i+1])
+    {
+        /*printf("i=%i, s->count_res=%i, res_data[i] = %i, size = %i, res_data[i]+size = %i, res_data[i+1] = %i\n",i, s->count_res, res_data[i], s->res[res[i]].size, res_data[i] + (s->res[res[i]].size/sizeof(int)), res_data[i+1]);*/
         error("Overlapping resources are not allowed.");
+    }
 }
 
 /* Reposition sorted pointers so that sorted[i] points to the first child of task ID= i*/
@@ -675,6 +993,11 @@ for(i = sorted[s->count_res]; i >= 0; i-- )
 {
     sorted[i] = sorted[i-1];
 }
+
+toc_run = getticks();
+//	message( "Sorting took %.3f ms" , ((double)(toc_run - tic)) * itpms );
+
+tic = getticks();
 /* If nothing overlaps create tasks.*/
 for( i = sorted[s->count_res]; i < s->count_res; i++ )
 {
@@ -688,7 +1011,11 @@ for( i = sorted[s->count_res]; i < s->count_res; i++ )
 
 }
 
-/* Check all tasks have load tasks - if not give parents (recursively)*/
+toc_run = getticks();
+//	message( "Creating load tasks took %.3f ms" , ((double)(toc_run - tic)) * itpms );
+
+
+/* Check all resources have load tasks - if not give parents (recursively)*/
 for(i = 0; i < s->count_res; i++ )
 {
     if( s->res[i].task == -1 )
@@ -714,41 +1041,355 @@ for(i = 0; i < s->count_res; i++ )
     }
 }
 
+int **usage_list;
+int *num_uses;
+int *size_uses;
+usage_list = (int**)malloc(sizeof(int*) * s->count_res);
+num_uses = (int*) malloc(sizeof(int) * s->count_res );
+size_uses = (int*) malloc(sizeof(int) * s->count_res);
+for(i = 0; i < s->count_res; i++ )
+{
+    usage_list[i] = (int*) malloc(sizeof(int) * (s->count_uses / s->count_res + 1));
+    num_uses[i] = 0;
+    size_uses[i] = s->count_uses / s->count_res + 1;   
+}
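+/* usage_list[r] will collect the IDs of every task that uses or locks resource r; num_uses[r] and size_uses[r] track its fill and capacity. */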
 
+/* Add deps from tasks to unload tasks. */
+for(i = 0; i < s->count; i++)
+{
+    t = &s->tasks[i];
+    if(t-> type == type_unload || t->type == type_load || t->type == type_ghost )
+        continue;
 
-/* Set up dependencies with the rest of the system.*/
-for( i = 0; i < s->count_res; i++ )
+    for(j = 0; j < t->nr_uses; j++)
+    {
+        t->unlocks[t->nr_unlocks] = s->res[t->uses[j]].utask;
+        deps_new_key[(t->unlocks - deps_new) + t->nr_unlocks] = i;
+        t->nr_unlocks++;
+        if(num_uses[t->uses[j]] == size_uses[t->uses[j]])
+        {
+            /* Stretch. */
+            int* temp = (int*) malloc(sizeof(int) * size_uses[t->uses[j]] * 2 );
+            memcpy( temp, usage_list[t->uses[j]], sizeof(int) * size_uses[t->uses[j]]);
+            free(usage_list[t->uses[j]]);
+            usage_list[t->uses[j]] = temp;
+            size_uses[t->uses[j]] *=2;
+        }
+        usage_list[t->uses[j]][num_uses[t->uses[j]]++] = i;
+    }
+    for(j = 0; j < t->nr_locks; j++)
+    {
+        t->unlocks[t->nr_unlocks] = s->res[t->locks[j]].utask;
+        deps_new_key[(t->unlocks - deps_new) + t->nr_unlocks] = i;
+        t->nr_unlocks++;
+        if(num_uses[t->locks[j]] == size_uses[t->locks[j]])
+        {
+            /* Stretch. */
+            int* temp = (int*) malloc(sizeof(int) * size_uses[t->locks[j]] * 2 );
+            memcpy( temp, usage_list[t->locks[j]], sizeof(int) * size_uses[t->locks[j]]);
+            free(usage_list[t->locks[j]]);
+            usage_list[t->locks[j]] = temp;
+            size_uses[t->locks[j]] *=2;
+        }
+        usage_list[t->locks[j]][num_uses[t->locks[j]]++] = i;
+    }
+}
+
+/* Loop through the resources: each resource's unload task unlocks its children's unload tasks, and its load task unlocks the tasks that use its data. */
+
+// TODO Make sure to stretch if needed.
+for(i = 0; i < s->count_res; i++ )
 {
-    int unlocked = 0;
-    for( j = 0; j < s->count; j++ )
+    int ID = res[i];
+    int size = s->res[ID].size;
+    int numChildren = sorted[ID+1] - sorted[ID];
+    int parent = s->res[ID].parent;
+    struct res *resource = &s->res[ res[i] ];
+
+    /* Loop through children if there are any. */    
+    if(numChildren > 0)
     {
-        struct task *t = &s->tasks[j];
-        for(k = 0; k < t->nr_uses; k++)
+        /* Set up the unload task's unlock list first. */
+        s->tasks[resource->utask].unlocks = &deps_new[s->count_deps];
+        s->tasks[resource->utask].nr_unlocks = 0;
+        if(s->count_deps + numChildren > s->size_deps)
         {
-            if(t->uses[k] == i)
+            qsched_task_t *temp1, *temp2;
+            s->size_deps *= qsched_stretch;
+            /* Allocate a new task list. */
+            if ( ( temp1 = (int*) malloc( sizeof(int) * s->size_deps ) ) == NULL ||
+                 ( temp2 = (int*) malloc( sizeof(int) * s->size_deps ) ) == NULL )
+                error( "Failed to allocate new deps lists." );
+            /* Copy the deps and keys over to the new list. */
+            memcpy( temp1 , deps_new , sizeof(int) * s->count_deps );
+            memcpy( temp2 , deps_new_key , sizeof(int) * s->count_deps );
+            int m;
+            /* Have to update since we aren't resorting later.*/
+            for(m = 0; m < s->count; m++)
             {
-                qsched_addunlock(s, s->res[i].task ,j);
-                qsched_addunlock(s, j, s->res[i].utask);
-                unlocked = 1;
-                break;
+                t = &s->tasks[m];
+                t->unlocks = &temp1[t->unlocks - deps_new];
             }
+            free(deps_new);
+            free(deps_new_key);
+            deps_new = temp1;
+            deps_new_key = temp2;
+//        printf("Stretch at line 1102, m = %i.\n", m);
         }
-        if(unlocked == 1)
+        for(j = 0; j < numChildren; j++)
         {
-            unlocked = 0;
-            continue;
+            struct res *child = &s->res[ res[sorted[ ID ]+j] ];
+
+            if( child->utask != resource->utask )
+            {
+                s->tasks[resource->utask].unlocks[ s->tasks[resource->utask].nr_unlocks ] = child->utask;
+                deps_new_key[s->count_deps] = resource->utask;
+                s->tasks[resource->utask].nr_unlocks += 1;
+                s->count_deps += 1;
+            }
         }
-        for(k = 0; k < t->nr_locks; k++)
+    }
+
+    /* Now set up the load task's unlock list. */
+
+    s->tasks[resource->task].unlocks = &deps_new[s->count_deps];
+    s->tasks[resource->task].nr_unlocks = 0;
+    if(numChildren > 0)
+    {
+        for(j = 0; j < numChildren; j++)
         {
-            if(t->locks[k] == i)
+            struct res *child = &s->res[ res[sorted[ ID ]+j] ];
+            if( child->utask == resource->utask )
             {
-                qsched_addunlock(s, s->res[i].task ,j);
-                qsched_addunlock(s, j, s->res[i].utask);
-                break;
+                if( s->size_deps < s->count_deps + num_uses[res[ sorted[ ID ]  + j ] ] )
+                {
+                    qsched_task_t *temp1, *temp2;
+                    s->size_deps *= qsched_stretch;
+                    /* Allocate a new task list. */
+                    if ( ( temp1 = (int*) malloc( sizeof(int) * s->size_deps ) ) == NULL ||
+                         ( temp2 = (int*) malloc( sizeof(int) * s->size_deps ) ) == NULL )
+                        error( "Failed to allocate new deps lists." );
+                    /* Copy the deps and keys over to the new list. */
+                    memcpy( temp1 , deps_new , sizeof(int) * s->count_deps );
+                    memcpy( temp2 , deps_new_key , sizeof(int) * s->count_deps );
+                    int m;
+                    /* Have to update since we aren't resorting later.*/
+                    for(m = 0; m < s->count; m++)
+                    {
+                        t = &s->tasks[m];
+                        t->unlocks = &temp1[t->unlocks - deps_new];
+                    }
+                    free(deps_new);
+                    free(deps_new_key);
+                    deps_new = temp1;
+                    deps_new_key = temp2;
+//                    printf("Stretch at line 1151.\n");
+                }                
+                for(k = 0; k < num_uses[ res[ sorted[ ID ] + j ] ]; k++)
+                {
+                    s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ sorted[ ID ] +j ] ][k];
+                    deps_new_key[s->count_deps] = resource->task;
+                    s->tasks[resource->task].nr_unlocks += 1;
+                    s->count_deps += 1;
+                }       
             }
         }
     }
+    if( s->size_deps < s->count_deps + 1 + num_uses[ res[ i ] ])
+    {
+            qsched_task_t *temp1, *temp2;
+            s->size_deps *= qsched_stretch;
+            /* Allocate a new task list. */
+            if ( ( temp1 = (int*) malloc( sizeof(int) * s->size_deps ) ) == NULL ||
+                 ( temp2 = (int*) malloc( sizeof(int) * s->size_deps ) ) == NULL )
+                error( "Failed to allocate new deps lists." );
+            /* Copy the deps and keys over to the new list. */
+            memcpy( temp1 , deps_new , sizeof(int) * s->count_deps );
+            memcpy( temp2 , deps_new_key , sizeof(int) * s->count_deps );
+            int m;
+            /* Have to update since we aren't resorting later.*/
+            for(m = 0; m < s->count; m++)
+            {
+                t = &s->tasks[m];
+                t->unlocks = &temp1[t->unlocks - deps_new];
+            }
+            free(deps_new);
+            free(deps_new_key);
+            deps_new = temp1;
+            deps_new_key = temp2;
+//        printf("Stretch at line 1185.\n");
+    }
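+    /* The load task also unlocks the parent resource's task (if any) and every task recorded as using this resource. */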
+    if( parent > 0 )
+    {
+        s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = s->res[parent].task;
+        deps_new_key[s->count_deps] = resource->task;
+        s->tasks[resource->task].nr_unlocks += 1;
+        s->count_deps += 1;
+    }
+
+    for(k = 0; k < num_uses[ res[ i ] ] ; k ++ )
+    {
+        s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ i ] ][k];
+        deps_new_key[s->count_deps] = resource->task;
+        s->tasks[resource->task].nr_unlocks += 1;
+        s->count_deps += 1;
+    }
+
 }
+//printf("s->deps = %i, deps_new = %i\n", s->deps, deps_new);
+free(s->deps);
+free(s->deps_key);
+s->deps = deps_new;
+s->deps_key = deps_new_key;
+s->flags &= ~qsched_flag_dirty;
+tic = getticks();
+//printf("Number tasks = %i\n", s->count);
+//printf("Number dependencies = %i\n", s->count_deps);
+/* Set up dependencies with the rest of the system.*/
+
+
+
+/*New version*/
+    /* Reset the waits to 0... */
+/*    for( k = 0; k < s->count; k++ )
+    {
+        s->tasks[k].wait = 0;
+    }    */
+
+    /* Run through the tasks and set the waits... */
+/*    for ( k = 0 ; k < s->count ; k++ ) {
+        struct task *t = &s->tasks[k];
+        if ( !( t->flags & task_flag_skip ) )
+            for ( j = 0 ; j < t->nr_unlocks ; j++ )
+                s->tasks[ t->unlocks[j] ].wait += 1;
+        }
+        */
+    /* Sort the tasks topologically. */
+/*    int *tid = (int *)malloc( sizeof(int) * s->count );
+    for ( j = 0 , k = 0 ; k < s->count ; k++ )
+        if ( s->tasks[k].wait == 0 ) {
+            tid[j] = k;
+            j += 1;
+            }
+    for ( k = 0 ; k < j ; k++ ) {
+        struct task *t = &s->tasks[ tid[k] ];
+        for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ )
+            if ( ( s->tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) {
+                tid[j] = t->unlocks[kk];
+                j += 1;
+                }
+        }
+    if ( k < s->count )
+    {
+        //printf("k = %i, count = %i\n", k, count);
+        error( "Circular dependencies detected." );
+     }
+
+*/
+/*Do unlocks */
+/*for(i = 0; i < s->count; i++)
+{
+    struct task *t = &s->tasks[i];
+    int result = 0;
+    if(t->type == type_ghost || t->type == type_unload || t->type == type_load)
+        continue;
+    for(j = 0; j < t->nr_uses; j++)
+    {
+        result = 0;
+        for(k = 0; k < t->nr_unlocks && result == 0; k++)
+        {
+            result = transitive_use_unlocks(s, &s->tasks[t->unlocks[k]], t->uses[j],0);
+                
+
+        }
+        if( result == 0)
+            qsched_addunlock(s, i, s->res[t->uses[j]].utask);
+    }
+    for(j = 0; j < t->nr_locks; j++)
+    {
+        result = 0;
+        for(k = 0; k < t->nr_unlocks && result == 0; k++)
+        {
+            result =transitive_use_unlocks(s, &s->tasks[t->unlocks[k]], t->locks[j],0);
+
+        }
+        if(result == 0)
+            qsched_addunlock(s, i, s->res[t->locks[j]].utask);
+    }
+}*/
+
+/*Do locks */
+/*for(i = s->count-1; i >= 0; i--)
+{
+    struct task *t = &s->tasks[i];
+    struct task *new_t;
+    int result = 0;
+    if(t->type == type_ghost || t->type == type_unload || t->type == type_load)
+        continue;
+    for(j = 0; j < t->nr_uses; j++)
+    {
+        result = 0;
+        for(k = i-1; k >= 0; k--)
+        {
+            new_t = &s->tasks[k];
+            for(x = 0; x < new_t->nr_unlocks && result == 0; x++)
+            {
+                if(new_t->unlocks[x] == i)
+                {
+                    result = transitive_use_locks(s, k, t->uses[j],0);
+                }
+            }
+        }
+        if(result == 0)
+        {
+            qsched_addunlock(s, s->res[t->uses[j]].task, i);
+        } 
+    }
+    for(j = 0; j < t->nr_locks; j++)
+    {
+        result = 0;
+        for(k = i-1; k >= 0; k--)
+        {
+            new_t = &s->tasks[k];
+            for(x = 0; x < new_t->nr_unlocks && result == 0; x++)
+            {
+                if(new_t->unlocks[x] == i)
+                {
+                    result = transitive_use_locks(s, k, t->locks[j],0);
+                }
+            }
+        }
+        if(result == 0)
+        {
+            qsched_addunlock(s, s->res[t->locks[j]].task, i);
+        } 
+    }
+   
+}*/
+
+/* Old version*/
+/*for(i = 0; i < s->count; i++)
+{
+    struct task *t = &s->tasks[i];
+    if(t->type == type_load || t->type == type_unload || t-> type == type_ghost)
+        continue;
+
+    for(k = 0; k < t->nr_uses; k++)
+    {
+        qsched_addunlock(s, s->res[t->uses[k]].task, i);
+        qsched_addunlock(s, i, s->res[t->uses[k]].utask);
+    }
+
+    for(k = 0; k < t->nr_locks; k++)
+    {
+        qsched_addunlock(s, s->res[t->locks[k]].task, i);
+        qsched_addunlock(s, i, s->res[t->locks[k]].utask);
+    }
+}*/
+//qsched_prepare_deps( s );
+//printf("Number dependencies = %i\n", s->count_deps);
+toc_run = getticks();
+//	message( "Setting up dependencies took %.3f ms" , ((double)(toc_run - tic)) * itpms );
 //error("Got to here");
 }
 
@@ -770,7 +1411,6 @@ for(i = 0; i < s->count_res; i++)
 //    cudaMalloc( &s->res[ i ].gpu_data, s->res[i].size );
     task = qsched_addtask( s , type_load , task_flag_none , &i , sizeof(int) , 0 );
     s->res[i].task = task;
-    printf("s->res[i].task = %i\n", s->res[i].task);
     unload = qsched_addtask( s , type_unload, task_flag_none , &i, sizeof(int), 0 );
     /*Load task unlocks each task that uses or locks the specified resource */
     /*Unload task is unlocked by each task that is unlocked by the load task. */
@@ -869,11 +1509,26 @@ char *sdata;
         /* All cleaned-up now! */
         //s->flags &= ~qsched_flag_dirty;
     
+
         }
 
-qsched_prepare_loads(s);
+    double itpms = 1000.0 / CPU_TPS;
+    ticks tic, toc_run;
+    tic = getticks();
+//qsched_ensure(s, s->count + 2*s->count_res, s->count_res, s->count_deps + 2*s->count_res, s->count_locks, s->count_uses + 2*s->count_res, s->size_data );
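+/* Run through the existing unlocks and bump the wait counts before the load/unload tasks are prepared. */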
 
+ for ( k = 0 ; k < count ; k++ ) {
+        t = &tasks[k];
+        if ( !( t->flags & task_flag_skip ) )
+            for ( j = 0 ; j < t->nr_unlocks ; j++ )
+            {
+                tasks[ t->unlocks[j] ].wait += 1;
+            }
+        }
 
+qsched_prepare_loads(s);
+toc_run = getticks();
+//	message( "prepare_loads took %.3f ms" , ((double)(toc_run - tic)) * itpms );
     
 
     
@@ -887,22 +1542,22 @@ qsched_prepare_loads(s);
     if ( s->flags & qsched_flag_dirty ) {
     
         /* Do the sorts in parallel, if possible. */
-        #pragma omp parallel
-        {
+       // #pragma omp parallel
+        //{
     
             /* Sort the unlocks. */
-            #pragma omp single nowait
-            qsched_sort( s->deps , s->deps_key , s->count_deps , 0 , count - 1 );
+            //#pragma omp single nowait
+           // qsched_sort( s->deps , s->deps_key , s->count_deps , 0 , count - 1 );
 
             /* Sort the locks. */
-            #pragma omp single nowait
-            qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 );
+        //    #pragma omp single nowait
+         //   qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 );
 
             /* Sort the uses. */
-            #pragma omp single nowait
-            qsched_sort( s->uses , s->uses_key , s->count_uses , 0 , count - 1 );
+           // #pragma omp single nowait
+           // qsched_sort( s->uses , s->uses_key , s->count_uses , 0 , count - 1 );
             
-        }
+      //  }
         
         /* Run throught the tasks and link the locks and unlocks. */
         tasks[0].unlocks = s->deps;
@@ -918,7 +1573,6 @@ qsched_prepare_loads(s);
         s->flags &= ~qsched_flag_dirty;
     
         }
-        
     /* Init the queues. */
     for ( k = 0 ; k < s->nr_queues ; k++ )
         queue_init( &s->queues[k] , count );
@@ -929,46 +1583,107 @@ qsched_prepare_loads(s);
         tasks[k].wait = 0;
     }    
 
+    int* store_waits;
+    if( (store_waits = (int*) malloc(sizeof(int)*s->count) ) == NULL)
+        error("Failed to allocate store_waits");
+
     /* Run through the tasks and set the waits... */
     for ( k = 0 ; k < count ; k++ ) {
         t = &tasks[k];
         if ( !( t->flags & task_flag_skip ) )
             for ( j = 0 ; j < t->nr_unlocks ; j++ )
+            {
                 tasks[ t->unlocks[j] ].wait += 1;
+            }
         }
-
         
     /* Sort the tasks topologically. */
     int *tid = (int *)malloc( sizeof(int) * count );
     for ( j = 0 , k = 0 ; k < count ; k++ )
+    {
+        store_waits[k] = tasks[k].wait;
         if ( tasks[k].wait == 0 ) {
             tid[j] = k;
             j += 1;
             }
+    }
     for ( k = 0 ; k < j ; k++ ) {
         t = &tasks[ tid[k] ];
         for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ )
+        {
             if ( ( tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) {
                 tid[j] = t->unlocks[kk];
                 j += 1;
                 }
         }
+        }
+
+        /* Print all dependencies */
+        /*for(i = 0; i < count; i++ )
+        {
+            printf("Task ID: %i, type=%i, ", i, tasks[i].type);
+            for(j = 0; j < tasks[i].nr_unlocks; j++)
+            {
+                printf("%i ", tasks[i].unlocks[j]);
+            }
+            printf("\n");
+        }*/
     if ( k < count )
+    {
+        printf("k = %i, wait = %i\n", tid[k-1], tasks[tid[k-1]].wait);
+        printf("tasks[0].nr_unlocks=%i\n", tasks[0].nr_unlocks);
+        for(i = 0; i < k-1; i++)
+        {
+            t = &tasks[tid[i]];
+            for(j = 0; j < t->nr_unlocks; j++)
+            {
+                if(t->unlocks[j] == tid[k-1])
+                    printf("%i ", tid[i]);
+
+            }
+        }
+        printf("\n");
+        for(i = 0; i < tasks[tid[k-1]].nr_unlocks; i++)
+        {
+            printf("%i ", tasks[tid[k-1]].unlocks[i]);
+        }
+        printf("\n");
+        for(i = 0; i < tasks[tid[k-1]].nr_unlocks; i++)
+        {
+            printf("unlocks[%i] = %i wait = %i\n", i, tasks[tid[k-1]].unlocks[i], tasks[tasks[tid[k-1]].unlocks[i]].wait);
+            for(j = 0; j < tasks[tasks[tid[k-1]].unlocks[i]].nr_unlocks; j++)
+            {
+                printf("%i ", tasks[tasks[tid[k-1]].unlocks[i]].unlocks[j]);
+            }
+            printf("\n");
+        }
+        printf("\n\n");
+        for(i = 0; i < k; i++)
+        {
+            if(tasks[tid[i]].type == 0)
+            {
+                int* idata = (int*)&(s->data[tasks[tid[i]].data]);
+                printf("%i %i,", tid[i], idata[0]);
+            }
+        }
+        printf("\n");
         error( "Circular dependencies detected." );
-        
+     }   
     /* Run through the topologically sorted tasks backwards and
        set their weights, re-setting the waits while we're at it. */
     for ( k = count-1 ; k >= 0 ; k-- ) {
         int maxweight = 0;
         t = &tasks[ tid[k] ];
+        t->wait = store_waits[tid[k]];
         for ( j = 0 ; j < t->nr_unlocks ; j++ ) {
-            tasks[ t->unlocks[j] ].wait += 1;
             if ( tasks[ t->unlocks[j] ].weight > maxweight )
                 maxweight = tasks[ t->unlocks[j] ].weight;
             }
         t->weight = t->cost + maxweight;
         }
 
+    free(store_waits);
+
 
 /*Allocate temporary tasks to setup device tasks*/
 temp = (struct task *) malloc(s->count * sizeof(struct task));
@@ -1023,16 +1738,6 @@ if( cudaMemcpyToSymbol ( uses_cuda, &setup_u ,  sizeof(qsched_res_t *), 0 ,  cud
 if( cudaMalloc( &res_t , sizeof(struct res) * s->count_res ) != cudaSuccess )
     error("Failed to allocated on the device.");
 
-for(i = 0; i < s->count_res; i++)
-{   
- //   if(s->res[i].size == 0)
-   //     continue;
-    //if( cudaMalloc( &data, s->res[i].size) != cudaSuccess )
-      //  error("Failed to allocate data space on the device.");
-    //s->res[i].gpu_data = data;
-}
-
-
 if( cudaMemcpy( res_t , s->res , sizeof(struct res) * s->count_res , cudaMemcpyHostToDevice) != cudaSuccess )
     error("Failed to copy resources to the device: %s", cudaGetErrorString(cudaPeekAtLastError()));
 if( cudaMemcpyToSymbol( res_cuda , &res_t , sizeof(struct res *) , 0 , cudaMemcpyHostToDevice) != cudaSuccess )
@@ -1056,21 +1761,9 @@ if( cudaMemcpy( cuda_t, temp, sizeof(struct task) * s->count , cudaMemcpyHostToD
 if( cudaMemcpyToSymbol ( tasks_cuda, &cuda_t ,  sizeof(struct task *) , 0 ,  cudaMemcpyHostToDevice ) != cudaSuccess )
     error("Failed to copy task pointer to the device.");
 
-
-
-
-/*for(i = 0; i < s->count; i++)
-{
-    printf("Task:%i of type %i has wait %i\n", i, temp[i].type, temp[i].wait);
-}*/
-
-
-
-
-
 /* Initialize the queues. */
-int nr_queues= 2,qsize;
-//int *data;
+int nr_queues= 3,qsize;
+int *data2;
 struct queue_cuda queues[ cuda_numqueues ];
 
  qsize = max(2*s->count / nr_queues, 256);
@@ -1080,14 +1773,21 @@ struct queue_cuda queues[ cuda_numqueues ];
     /* Allocate a temporary buffer for the queue data. */
         if ( ( data = (int *)malloc( sizeof(int) * qsize ) ) == NULL )
             error("Failed to allocate data buffer.");
+        if( ( data2 = (int *) malloc( sizeof(int) * qsize ) ) == NULL )
+            error("Failed to allocate data2 buffer.");
 
     queues[1].count = 0;
     for(i = 0; i < s->count; i++)
     {
         if(temp[i].type == type_load && temp[i].wait == 0)
+        {
             data[queues[1].count++] = i;
+            data2[queues[1].count-1] = -temp[i].weight;
+        }
     
     }
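+    /* Sort the ready load tasks by descending weight (data2 holds negated weights) so the heaviest loads are issued first. */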
+    qsched_sort(data, data2, queues[1].count, minVal(data2,queues[1].count), maxVal(data2, queues[1].count));
+    free(data2);
      for ( k = queues[1].count ; k < qsize ; k++ )
         data[k] = -1;  
     /* Allocate and copy the data. */
@@ -1109,24 +1809,57 @@ struct queue_cuda queues[ cuda_numqueues ];
         error("Failed to allocate queue data on the device.");
     if ( cudaMemcpy( (void *)queues[0].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
         error("Failed to copy queue data pointer to the device");
+/* Allocate and copy the recycling data. */
+    if ( cudaMalloc( &queues[2].rec_data , sizeof(int) * qsize ) != cudaSuccess )
+        error("Failed to allocate queue data on the device.");
+    if ( cudaMemcpy( (void *)queues[2].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
+        error("Failed to copy queue data pointer to the device");
         
     /* Set some other values. */
     queues[1].first = 0;
     queues[1].last = queues[1].count;
+    queues[1].nr_avail_tasks = queues[1].last;
     queues[1].rec_count = 0;  
 
+    /* Init queue 2 (the high-priority unload queue). */
+    int num_unload=0;
+    queues[2].count = 0;
+    for( k = 0; k < s->count; k++ )
+    {
+        if(temp[k].type == type_unload)
+        {
+            num_unload++;
+            if(temp[k].wait == 0)
+                data[queues[2].count++] = k;
+        }
+    }
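+    /* Only unload tasks with no outstanding waits are placed in the queue now; count is set below to the total number of unload tasks. */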
+    queues[2].first = 0;
+    queues[2].last = queues[2].count;
+    queues[2].nr_avail_tasks = queues[2].last;
+    queues[2].count = num_unload;
+    queues[2].rec_count = 0;
+    /* Allocate and copy the data. */
+    if ( cudaMalloc( &queues[2].data , sizeof(int) * qsize ) != cudaSuccess )
+        error("Failed to allocate queue data on the device.");
+    if ( cudaMemcpy( (void *)queues[2].data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
+        error("Failed to copy queue data pointer to the device");
+
+    for ( k = 0; k < qsize; k++ )
+            data[k] = -1;
+
     /* Init queue 0*/
     queues[0].count = 0;
     for ( k = 0; k < s->count ; k++ )
     {
-        if(temp[k].type != type_load && temp[k].wait == 0){
+        if(temp[k].type != type_load && temp[k].type != type_unload && temp[k].wait == 0){
             data[queues[0].count++] = k;
     }
 
     }
     queues[0].first = 0;
     queues[0].last = queues[0].count;
-    queues[0].count = s->count - queues[1].count;
+    queues[0].nr_avail_tasks = queues[0].last;
+    queues[0].count = s->count - queues[1].count - queues[2].count;
     queues[0].rec_count = 0;    
 
 
@@ -1142,7 +1875,6 @@ struct queue_cuda queues[ cuda_numqueues ];
 
     /* Clean up. */
     free( tid );
-    printf("Queue[0].count = %i , queue[1].count = %i\n", queues[0].count, queues[1].count);
         
     /* Set the number of waiting tasks. */
     s->waiting = count;
@@ -1152,8 +1884,6 @@ struct queue_cuda queues[ cuda_numqueues ];
 
     free( temp );
     free( data );
-    /* Unlock the sched. */
-    //lock_unlock_blind( &s->lock );
     
 }
 
@@ -1175,40 +1905,36 @@ struct task* qsched_get_timers( struct qsched *s, int numtasks )
     return cuda_tasks;
 }
 
-/*void qsched_get_CUDA_tasks ( struct qsched *s , struct task *cuda_tasks , int numtasks )
-{
-
-}*/
-
 /**
  * @brief Execute all the tasks in the current scheduler using
- *        OpenMP.
+ *        CUDA.
  *
  * @param s Pointer to the #qsched.
  * @param fun User-supplied function that will be called with the
  *        task type and a pointer to the task data. This must be a __device__ function!
  *
  * This function is only available if QuickSched was compiled with
- * OpenMP support.
+ * CUDA support.
  */
 void qsched_run_CUDA ( struct qsched *s, qsched_funtype func) {
 
 #ifdef WITH_CUDA
+    ProfilerStart("/home/aidan/quicksched-code/examples/profiler.out");
     double itpms = 1000.0 / CPU_TPS;
     ticks tic, toc_run ;
     tic = getticks();
     qsched_prepare_cuda( s );
   toc_run = getticks(); 
-	message( "prepare_cuda took %.3f ms" , ((double)(toc_run - tic)) * itpms );
+//	message( "prepare_cuda took %.3f ms" , ((double)(toc_run - tic)) * itpms );
     cudaMemcpyToSymbol( fun , &func , sizeof(qsched_funtype));
     tic = getticks();
-    qsched_device_kernel<<<1, 128 >>> (  );
+    ProfilerStop();
+    cudaMemcpyToSymbol( tot_num_tasks, &s->count, sizeof(int) );
+    qsched_device_kernel<<<128, 128 >>> (  );
     if( cudaDeviceSynchronize() != cudaSuccess )
         error("Failed to execute kernel:%s", cudaGetErrorString(cudaPeekAtLastError()));
   toc_run = getticks(); 
-	message( "run_CUDA took %.3f ms" , ((double)(toc_run - tic)) * itpms );
-
-    // TODO Unload timers (optional).
+    message( "run_CUDA took %.3f ms" , ((double)(toc_run - tic)) * itpms );
 
 #else
         error("QuickSched was not compiled with CUDA support.");
diff --git a/src/cuda_queue.h b/src/cuda_queue.h
index 27f41c5f32674b8bdbc1c6249b44ccb60b39e161..b548f8ca512c7844b35609e85cc933f60a27acc7 100644
--- a/src/cuda_queue.h
+++ b/src/cuda_queue.h
@@ -16,7 +16,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * 
  ******************************************************************************/
-#define cuda_numqueues 2
+#define cuda_numqueues 3
 
 /** Type definition for the execution function in #qsched_run. */
 typedef void (*qsched_funtype)( int , void * );
@@ -38,6 +38,8 @@ struct queue_cuda {
     
     /* The recycling list. */
     volatile int *rec_data;
+    
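+    /* Number of tasks currently available to be taken from this queue. */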
+    volatile int nr_avail_tasks;
 
     };
 
diff --git a/src/qsched.c b/src/qsched.c
index cbc148424029ca39014c752194fc8a41d766d1dc..eebbfdfc1be395b66e6d1602fe6bdb3ebeaf2ff9 100644
--- a/src/qsched.c
+++ b/src/qsched.c
@@ -95,7 +95,7 @@ void qsched_res_own ( struct qsched *s , qsched_res_t res , int owner ) {
 void qsched_ensure ( struct qsched *s , int nr_tasks , int nr_res , int nr_deps , int nr_locks , int nr_uses , int size_data ) {
 
     int dirty = 0;
-    
+    printf("Called qsched_ensure.\n");
     /* Re-allocate tasks? */
     if ( s->size < nr_tasks ) {
         dirty = 1;
@@ -787,71 +787,6 @@ struct task* qsched_gettask ( struct qsched *s , int qid ) {
 
     }
 
-/**
- * @brief Sort the data according to the given indices.
- *
- * @param data The data to be sorted
- * @param ind The indices with respect to which the data are sorted.
- * @param N The number of entries
- * @param min Lowest index.
- * @param max highest index.
- *
- * This function calls qsched_quicksort.
- */
- 
-void qsched_sort ( int *restrict data, int *restrict ind, int N, int min, int max ) {
-    int *new_data;
-    int *new_ind;
-    int i;
-    if(N <= 0)
-        return;
-    new_data = (int*)malloc(sizeof(int) * N);
-    if(new_data == NULL)
-        error("Failed to allocate new_data");
-    new_ind = (int*)malloc(sizeof(int) * N);
-    if(new_ind == NULL)
-        error("Failed to allocate new_ind");
-
-    /*Create buckets of size ? - Ideally <16 elements per bucket. Use max-min / N * 10 ? Should give average of 10 elements per bucket */
-    int bucketsize = 1;
-
-    /* To find bucket do ind-min / b and it goes in that bucket.*/
-    int num_buckets = (max-min) / bucketsize +1;
-    int *bucket_inds = (int*) malloc(sizeof(int) * num_bucket);
-    if(bucket_inds == NULL)
-        error("Failed to allocate bucket_inds");
-    memset(bucket_inds,0, sizeof(int)*num_buckets);
-    for(i = 0; i < N; i++)
-    {
-        bucket_inds[(ind[i]-min)]++;
-    }
-    for(i = 1; i < num_buckets; i++ )
-    {
-        bucket_inds[i] = bucket_inds[i] + bucket_inds[i-1];
-    }
-    /* bucket_inds[i] contains the starting position for the i+1th bucket*/
-    for(i = num_buckets-1; i >0; i--)
-    {
-        bucket_inds[i] = bucket_inds[i-1];
-    }
-    bucket_inds[0] = 0;
-
-    for(i = 0; i < N; i++)
-    {
-        int z = (ind[i]-min);
-        new_data[bucket_inds[z]] = data[i];
-        new_ind[bucket_inds[z]++] = ind[i];
-    }
-
-    /* Copy data back to data and ind and deallocate everything!*/
-    memcpy(data, new_data, N*sizeof(int));
-    memcpy(ind, new_ind, N*sizeof(int));
-    free(new_data);
-    free(new_ind);
-    free(bucket_inds);
-
-}
-
 /**
  * @brief Sort the data according to the given indices.
  *
@@ -872,7 +807,6 @@ void qsched_quicksort ( int *restrict data , int *restrict ind , int N , int min
     
     /* If N is small enough, just do insert sort. */
     if ( N < 16 ) {
-        printf("%i\n", N);
         for ( i = 1 ; i < N ; i++ )
             if ( ind[i] < ind[i-1] ) {
                 temp_i = ind[i];
@@ -934,6 +868,167 @@ void qsched_quicksort ( int *restrict data , int *restrict ind , int N , int min
         
     }
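+/* Compute floor(log2(v)) using the classic shift-and-mask bit trick. */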
 
+int log_base2(unsigned int v)
+{
+const unsigned int b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
+const unsigned int S[] = {1, 2, 4, 8, 16};
+int i;
+
+register unsigned int r = 0; // result of log2(v) will go here
+for (i = 4; i >= 0; i--) // unroll for speed...
+{
+  if (v & b[i])
+  {
+    v >>= S[i];
+    r |= S[i];
+  } 
+}
+return r;
+}
+/**
+ * @brief Sort the data according to the given indices.
+ *
+ * @param data The data to be sorted
+ * @param ind The indices with respect to which the data are sorted.
+ * @param N The number of entries
+ * @param min Lowest index.
+ * @param max highest index.
+ *
+ * This function performs a recursive bucket sort, falling back to an insertion sort for small N.
+ */
+ 
+void qsched_sort ( int *restrict data, int *restrict ind, int N, int min, int max ) {
+    int *new_data;
+    int *new_ind;
+    int i, j, temp_i, temp_d;
+  //  printf("N=%i, max =%i, min = %i\n", N, max, min);
+    if(N <= 0)
+        return;
+ //   if((max-min) > 10*N)
+  //  {
+   //     qsched_quicksort(data, ind, N, min, max);
+   //     return;
+   // }
+//    printf("sort size: %i, max-min = %i\n",N, max-min);
+    /* If N is small enough, just do insert sort. */
+    if ( N < 11 ) {
+//        printf("%i\n", N);
+        for ( i = 1 ; i < N ; i++ )
+            if ( ind[i] < ind[i-1] ) {
+                temp_i = ind[i];
+                temp_d = data[i];
+                for ( j = i ; j > 0 && ind[j-1] > temp_i ; j-- ) {
+                    ind[j] = ind[j-1];
+                    data[j] = data[j-1];
+                    }
+                ind[j] = temp_i;
+                data[j] = temp_d;
+                }
+        return;
+        }
+
+    if((max-min) > 8*N)
+    {
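+        /* Keys are sparse relative to N: scatter the entries into buckets of width 2^log, then sort each bucket recursively. */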
+        int log = log_base2(max-min)-2;
+        log = log_base2((max-min)/8);
+    //    int bucketsize = 1 << log;
+        /* Each bucket contains (max-min)/10 width.*/
+        new_data = (int*)malloc(sizeof(int)*N);
+        new_ind = (int*)malloc(sizeof(int)*N);
+        int num_buckets = ((max-min) >> log) +1;
+        int *bucket_inds = (int*)malloc(sizeof(int)*num_buckets);
+        memset(bucket_inds, 0, sizeof(int)*num_buckets);
+        for(i = 0; i < N; i++)
+        {
+           // printf("%i\n", (ind[i]-min) >> log);
+            if(((ind[i]-min) >> log ) > num_buckets)
+                error("Exceeded num_buckets, ind[i]-min = %i, log = %i, ind[i]-min>>log=%i, ind[i] = %i, max = %i, num_buckets = %i, N=%i", ind[i]-min, log, (ind[i]-min)>>log, ind[i], max, num_buckets, N);
+            bucket_inds[(ind[i]-min) >> log]++;
+        }
+        for(i = 1; i < num_buckets; i++)
+        {
+            bucket_inds[i] = bucket_inds[i] + bucket_inds[i-1];
+        }
+        for(i = num_buckets-1; i > 0; i--)
+        {
+            bucket_inds[i] = bucket_inds[i-1];
+        }
+        bucket_inds[0] = 0;
+        for(i = 0; i < N; i++)
+        {
+            int z = (ind[i]-min) >> log;
+            if(ind[i] > max || ind[i] < min)
+                error("ind[i]=%i, max = %i, min = %i\n", ind[i], max, min);
+
+            new_data[bucket_inds[z]] = data[i];
+            new_ind[bucket_inds[z]++] = ind[i];
+        }
+        memcpy(data, new_data, N*sizeof(int));
+        memcpy(ind, new_ind, N*sizeof(int));
+        free(new_data);
+        free(new_ind);
+        for(i = num_buckets-1; i > 0; i--)
+        {
+            bucket_inds[i] = bucket_inds[i-1];
+        }
+        bucket_inds[0] = 0;
+        /* Recursively sort each bucket; the last bucket ends at N. */
+        for(i = 0; i < num_buckets; i++)
+        {
+            int bucket_end = (i == num_buckets-1) ? N : bucket_inds[i+1];
+//            printf("bucket_inds[i] = %i, N = %i, i = %i, num_buckets = %i\n", bucket_inds[i], N, i, num_buckets);
+            qsched_sort(&data[bucket_inds[i]], &ind[bucket_inds[i]], bucket_end - bucket_inds[i], min + (1<<log)*i, min + (1<<log)*(i+1) );
+        }
+        free(bucket_inds);
+        return;
+    }
+    new_data = (int*)malloc(sizeof(int) * N);
+    if(new_data == NULL)
+        error("Failed to allocate new_data");
+    new_ind = (int*)malloc(sizeof(int) * N);
+    if(new_ind == NULL)
+        error("Failed to allocate new_ind");
+
+
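+    /* Keys are dense: fall back to a counting sort with one bucket per distinct index value. */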
+    int bucketsize = 1;
+
+    /* To find bucket do ind-min / b and it goes in that bucket.*/
+    int num_buckets = (max-min) / bucketsize +1;
+    int *bucket_inds = (int*) malloc(sizeof(int) * num_buckets);
+    if(bucket_inds == NULL)
+        error("Failed to allocate bucket_inds, max = %i, min = %i", max, min);
+    memset(bucket_inds,0, sizeof(int)*num_buckets);
+    for(i = 0; i < N; i++)
+    {
+        bucket_inds[(ind[i]-min)]++;
+    }
+    for(i = 1; i < num_buckets; i++ )
+    {
+        bucket_inds[i] = bucket_inds[i] + bucket_inds[i-1];
+    }
+    /* bucket_inds[i] contains the starting position for the i+1th bucket*/
+    for(i = num_buckets-1; i >0; i--)
+    {
+        bucket_inds[i] = bucket_inds[i-1];
+    }
+    bucket_inds[0] = 0;
+
+    for(i = 0; i < N; i++)
+    {
+        int z = (ind[i]-min);
+        new_data[bucket_inds[z]] = data[i];
+        new_ind[bucket_inds[z]++] = ind[i];
+    }
+
+    /* Copy data back to data and ind and deallocate everything!*/
+    memcpy(data, new_data, N*sizeof(int));
+    memcpy(ind, new_ind, N*sizeof(int));
+    free(new_data);
+    free(new_ind);
+    free(bucket_inds);
+
+}
+
+
+
 
 /**
  * @brief Prepare a #qsched for execution.
@@ -1275,7 +1370,6 @@ void qsched_addunlock ( struct qsched *s , int ta , int tb ) {
     
     /* Do the deps need to be re-allocated? */
     if ( s->count_deps == s->size_deps ) {
-    
         /* Scale the deps list size. */
         s->size_deps *= qsched_stretch;
         
@@ -1337,7 +1431,6 @@ int qsched_addtask ( struct qsched *s , int type , unsigned int flags , void *da
     
     /* Do the tasks need to be re-allocated? */
     if ( s->count == s->size ) {
-    
         /* Scale the task list size. */
         s->size *= qsched_stretch;
         
diff --git a/src/task.h b/src/task.h
index dccc2dae7e078510db30c7d368ad1c9769668978..99cd36dcb957f473c799036d2223f7c557dd441e 100644
--- a/src/task.h
+++ b/src/task.h
@@ -54,6 +54,10 @@ struct task {
     int qid;
     
     /* Task weight for queue selection. */
-    int cost, weight; 
+    int cost, weight;
+
+    #ifdef WITH_CUDA
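+    /* ID of the CUDA block that executed this task (set by the device kernel). */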
+    int blockID;
+    #endif 
 
     };