diff --git a/autom4te.cache/requests b/autom4te.cache/requests
index 7675eba62587b513c5752f30ea1140a2981e22c3..6cd5e4a3a44e4986e168bef0c16985538948cd38 100644
--- a/autom4te.cache/requests
+++ b/autom4te.cache/requests
@@ -58,9 +58,9 @@
                         '_LT_AC_LANG_CXX_CONFIG' => 1,
                         'AM_PROG_MKDIR_P' => 1,
                         'DX_IF_FEATURE' => 1,
-                        'DX_FEATURE_doc' => 1,
                         'AM_AUTOMAKE_VERSION' => 1,
                         'DX_CHM_FEATURE' => 1,
+                        'DX_FEATURE_doc' => 1,
                         'DX_PDF_FEATURE' => 1,
                         'AM_SUBST_NOTMAKE' => 1,
                         'AM_MISSING_PROG' => 1,
@@ -68,11 +68,11 @@
                         '_LT_AC_LANG_C_CONFIG' => 1,
                         'AM_PROG_INSTALL_STRIP' => 1,
                         '_m4_warn' => 1,
-                        'AX_GCC_X86_CPUID' => 1,
                         'AC_LIBTOOL_OBJDIR' => 1,
+                        'AX_GCC_X86_CPUID' => 1,
                         'gl_FUNC_ARGZ' => 1,
-                        'LTOBSOLETE_VERSION' => 1,
                         'AM_SANITY_CHECK' => 1,
+                        'LTOBSOLETE_VERSION' => 1,
                         'AC_LIBTOOL_LANG_GCJ_CONFIG' => 1,
                         'AC_LIBTOOL_PROG_COMPILER_PIC' => 1,
                         'LT_LIB_M' => 1,
@@ -85,21 +85,21 @@
                         'AC_LIBTOOL_GCJ' => 1,
                         'DX_CLEAR_DEPEND' => 1,
                         '_LT_WITH_SYSROOT' => 1,
-                        'LT_SYS_DLOPEN_DEPLIBS' => 1,
                         'LT_FUNC_DLSYM_USCORE' => 1,
-                        'AC_LIBTOOL_CONFIG' => 1,
+                        'LT_SYS_DLOPEN_DEPLIBS' => 1,
                         '_LT_AC_LANG_F77' => 1,
-                        'AC_LTDL_DLLIB' => 1,
+                        'AC_LIBTOOL_CONFIG' => 1,
                         '_AM_SUBST_NOTMAKE' => 1,
+                        'AC_LTDL_DLLIB' => 1,
                         '_AM_AUTOCONF_VERSION' => 1,
                         'AM_DISABLE_SHARED' => 1,
                         '_LT_PROG_ECHO_BACKSLASH' => 1,
                         '_LTDL_SETUP' => 1,
-                        'AM_PROG_LIBTOOL' => 1,
                         '_LT_AC_LANG_CXX' => 1,
-                        'AM_PROG_LD' => 1,
-                        '_LT_AC_FILE_LTDLL_C' => 1,
+                        'AM_PROG_LIBTOOL' => 1,
                         'AC_LIB_LTDL' => 1,
+                        '_LT_AC_FILE_LTDLL_C' => 1,
+                        'AM_PROG_LD' => 1,
                         'DX_DOXYGEN_FEATURE' => 1,
                         'AU_DEFUN' => 1,
                         'AC_PROG_NM' => 1,
@@ -113,58 +113,58 @@
                         '_AM_SET_OPTION' => 1,
                         'AC_LTDL_PREOPEN' => 1,
                         '_LT_LINKER_BOILERPLATE' => 1,
-                        '_LT_PREPARE_SED_QUOTE_VARS' => 1,
-                        'AC_LIBTOOL_PROG_CC_C_O' => 1,
                         'AC_LIBTOOL_LANG_CXX_CONFIG' => 1,
+                        'AC_LIBTOOL_PROG_CC_C_O' => 1,
+                        '_LT_PREPARE_SED_QUOTE_VARS' => 1,
                         'gl_PREREQ_ARGZ' => 1,
                         'AX_EXT' => 1,
-                        'LT_SUPPORTED_TAG' => 1,
                         'AM_OUTPUT_DEPENDENCY_COMMANDS' => 1,
-                        'LT_PROG_RC' => 1,
+                        'LT_SUPPORTED_TAG' => 1,
                         'LT_SYS_MODULE_EXT' => 1,
+                        'LT_PROG_RC' => 1,
                         'AC_DEFUN_ONCE' => 1,
                         'AX_CHECK_COMPILE_FLAG' => 1,
                         'DX_XML_FEATURE' => 1,
                         '_LT_AC_LANG_GCJ' => 1,
                         'AC_LTDL_OBJDIR' => 1,
-                        'DX_TEST_FEATURE' => 1,
                         '_LT_PATH_TOOL_PREFIX' => 1,
+                        'DX_TEST_FEATURE' => 1,
                         'AC_LIBTOOL_RC' => 1,
-                        '_LT_AC_PROG_ECHO_BACKSLASH' => 1,
-                        'AC_DISABLE_FAST_INSTALL' => 1,
                         'AM_SILENT_RULES' => 1,
+                        'AC_DISABLE_FAST_INSTALL' => 1,
+                        '_LT_AC_PROG_ECHO_BACKSLASH' => 1,
                         'DX_CHECK_DEPEND' => 1,
                         'DX_FEATURE_pdf' => 1,
-                        'include' => 1,
-                        'DX_REQUIRE_PROG' => 1,
-                        '_LT_AC_TRY_DLOPEN_SELF' => 1,
                         '_LT_AC_SYS_LIBPATH_AIX' => 1,
+                        '_LT_AC_TRY_DLOPEN_SELF' => 1,
+                        'DX_REQUIRE_PROG' => 1,
+                        'include' => 1,
                         'LT_AC_PROG_SED' => 1,
                         'AM_ENABLE_SHARED' => 1,
                         'DX_FEATURE_html' => 1,
                         'LTDL_INSTALLABLE' => 1,
-                        'DX_CURRENT_DESCRIPTION' => 1,
                         '_LT_AC_LANG_GCJ_CONFIG' => 1,
+                        'DX_CURRENT_DESCRIPTION' => 1,
                         'AC_ENABLE_SHARED' => 1,
-                        '_LT_REQUIRED_DARWIN_CHECKS' => 1,
-                        'AC_LIBTOOL_SYS_HARD_LINK_LOCKS' => 1,
                         'AC_ENABLE_STATIC' => 1,
-                        'AX_FUNC_POSIX_MEMALIGN' => 1,
-                        'AM_PROG_CC_C_O' => 1,
+                        'AC_LIBTOOL_SYS_HARD_LINK_LOCKS' => 1,
+                        '_LT_REQUIRED_DARWIN_CHECKS' => 1,
                         '_LT_AC_TAGVAR' => 1,
+                        'AM_PROG_CC_C_O' => 1,
+                        'AX_FUNC_POSIX_MEMALIGN' => 1,
                         'AC_LIBTOOL_LANG_F77_CONFIG' => 1,
                         'AM_CONDITIONAL' => 1,
                         'LT_LIB_DLLOAD' => 1,
                         'DX_FEATURE_dot' => 1,
-                        'LTVERSION_VERSION' => 1,
-                        '_LT_PROG_CXX' => 1,
-                        '_LT_PROG_F77' => 1,
                         'LTDL_INIT' => 1,
-                        'm4_include' => 1,
+                        '_LT_PROG_F77' => 1,
+                        '_LT_PROG_CXX' => 1,
+                        'LTVERSION_VERSION' => 1,
                         'AM_PROG_INSTALL_SH' => 1,
+                        'm4_include' => 1,
                         'AC_PROG_EGREP' => 1,
-                        'AC_PATH_MAGIC' => 1,
                         '_AC_AM_CONFIG_HEADER_HOOK' => 1,
+                        'AC_PATH_MAGIC' => 1,
                         'AC_LTDL_SYSSEARCHPATH' => 1,
                         'AM_MAKE_INCLUDE' => 1,
                         'LT_CMD_MAX_LEN' => 1,
@@ -180,51 +180,51 @@
                         'AC_PROG_LD_RELOAD_FLAG' => 1,
                         'DX_FEATURE_chi' => 1,
                         'AC_LTDL_DLSYM_USCORE' => 1,
-                        'AM_MISSING_HAS_RUN' => 1,
                         'LT_LANG' => 1,
+                        'AM_MISSING_HAS_RUN' => 1,
                         'LT_SYS_DLSEARCH_PATH' => 1,
                         'LT_CONFIG_LTDL_DIR' => 1,
-                        'AC_LIBTOOL_DLOPEN_SELF' => 1,
                         'LT_OUTPUT' => 1,
+                        'AC_LIBTOOL_DLOPEN_SELF' => 1,
                         'AC_LIBTOOL_PROG_LD_SHLIBS' => 1,
-                        'AC_WITH_LTDL' => 1,
                         'AC_LIBTOOL_LINKER_OPTION' => 1,
+                        'AC_WITH_LTDL' => 1,
                         'DX_CHI_FEATURE' => 1,
-                        'LT_AC_PROG_RC' => 1,
                         'AC_LIBTOOL_CXX' => 1,
+                        'LT_AC_PROG_RC' => 1,
                         'LT_INIT' => 1,
-                        'AX_CHECK_COMPILER_FLAGS' => 1,
-                        'LT_AC_PROG_GCJ' => 1,
                         'LT_SYS_DLOPEN_SELF' => 1,
+                        'LT_AC_PROG_GCJ' => 1,
+                        'AX_CHECK_COMPILER_FLAGS' => 1,
                         'DX_CURRENT_FEATURE' => 1,
-                        'AM_DEP_TRACK' => 1,
-                        'AM_DISABLE_STATIC' => 1,
                         '_LT_AC_PROG_CXXCPP' => 1,
-                        'AM_CONFIG_HEADER' => 1,
+                        'AM_DISABLE_STATIC' => 1,
+                        'AM_DEP_TRACK' => 1,
                         '_AC_PROG_LIBTOOL' => 1,
-                        'DX_HTML_FEATURE' => 1,
+                        'AM_CONFIG_HEADER' => 1,
                         '_AM_IF_OPTION' => 1,
+                        'DX_HTML_FEATURE' => 1,
                         'AC_PATH_TOOL_PREFIX' => 1,
-                        'm4_pattern_allow' => 1,
                         'AC_LIBTOOL_F77' => 1,
+                        'm4_pattern_allow' => 1,
                         'AM_SET_LEADING_DOT' => 1,
-                        '_LT_PROG_FC' => 1,
                         'LT_AC_PROG_EGREP' => 1,
-                        'DX_DIRNAME_EXPR' => 1,
+                        '_LT_PROG_FC' => 1,
                         '_AM_DEPENDENCIES' => 1,
+                        'DX_DIRNAME_EXPR' => 1,
                         'AC_LIBTOOL_LANG_C_CONFIG' => 1,
                         'LTOPTIONS_VERSION' => 1,
                         '_LT_AC_SYS_COMPILER' => 1,
-                        'DX_FEATURE_man' => 1,
                         'AM_PROG_NM' => 1,
+                        'DX_FEATURE_man' => 1,
                         'DX_PS_FEATURE' => 1,
                         'AC_LIBLTDL_CONVENIENCE' => 1,
                         'AC_DEPLIBS_CHECK_METHOD' => 1,
-                        'AC_LIBLTDL_INSTALLABLE' => 1,
                         'AM_SET_CURRENT_AUTOMAKE_VERSION' => 1,
+                        'AC_LIBLTDL_INSTALLABLE' => 1,
                         'AC_LTDL_ENABLE_INSTALL' => 1,
-                        'LT_PROG_GCJ' => 1,
                         'AC_LIBTOOL_SYS_DYNAMIC_LINKER' => 1,
+                        'LT_PROG_GCJ' => 1,
                         'DX_FEATURE_chm' => 1,
                         'AM_INIT_AUTOMAKE' => 1,
                         'DX_FEATURE_rtf' => 1,
@@ -235,32 +235,32 @@
                         '_LT_AC_LOCK' => 1,
                         '_LT_AC_LANG_RC_CONFIG' => 1,
                         'LT_PROG_GO' => 1,
-                        'DX_ENV_APPEND' => 1,
                         'LT_SYS_MODULE_PATH' => 1,
-                        'LT_WITH_LTDL' => 1,
+                        'DX_ENV_APPEND' => 1,
                         'AC_LIBTOOL_POSTDEP_PREDEP' => 1,
-                        'DX_ARG_ABLE' => 1,
-                        'AX_GCC_ARCHFLAG' => 1,
+                        'LT_WITH_LTDL' => 1,
                         'AC_LTDL_SHLIBPATH' => 1,
+                        'AX_GCC_ARCHFLAG' => 1,
+                        'DX_ARG_ABLE' => 1,
                         'AX_OPENMP' => 1,
                         'AM_AUX_DIR_EXPAND' => 1,
-                        'AC_LIBTOOL_PROG_COMPILER_NO_RTTI' => 1,
                         '_LT_AC_LANG_F77_CONFIG' => 1,
-                        '_LT_COMPILER_OPTION' => 1,
+                        'AC_LIBTOOL_PROG_COMPILER_NO_RTTI' => 1,
                         '_AM_SET_OPTIONS' => 1,
-                        'AM_RUN_LOG' => 1,
+                        '_LT_COMPILER_OPTION' => 1,
                         '_AM_OUTPUT_DEPENDENCY_COMMANDS' => 1,
-                        'AC_LIBTOOL_PICMODE' => 1,
-                        'AC_LTDL_SYS_DLOPEN_DEPLIBS' => 1,
+                        'AM_RUN_LOG' => 1,
                         'AC_LIBTOOL_SYS_OLD_ARCHIVE' => 1,
-                        'ACX_PTHREAD' => 1,
-                        'AC_CHECK_LIBM' => 1,
+                        'AC_LTDL_SYS_DLOPEN_DEPLIBS' => 1,
+                        'AC_LIBTOOL_PICMODE' => 1,
                         'LT_PATH_LD' => 1,
+                        'AC_CHECK_LIBM' => 1,
+                        'ACX_PTHREAD' => 1,
                         'AC_LIBTOOL_SYS_LIB_STRIP' => 1,
                         '_AM_MANGLE_OPTION' => 1,
-                        'DX_FEATURE_xml' => 1,
-                        'AC_LIBTOOL_SYS_MAX_CMD_LEN' => 1,
                         'AC_LTDL_SYMBOL_USCORE' => 1,
+                        'AC_LIBTOOL_SYS_MAX_CMD_LEN' => 1,
+                        'DX_FEATURE_xml' => 1,
                         'AM_SET_DEPDIR' => 1,
                         '_LT_CC_BASENAME' => 1,
                         'DX_FEATURE_ps' => 1,
@@ -280,57 +280,57 @@
                         'configure.in'
                       ],
                       {
-                        'AM_PROG_F77_C_O' => 1,
                         '_LT_AC_TAGCONFIG' => 1,
-                        'm4_pattern_forbid' => 1,
+                        'AM_PROG_F77_C_O' => 1,
                         'AC_INIT' => 1,
-                        'AC_CANONICAL_TARGET' => 1,
+                        'm4_pattern_forbid' => 1,
                         '_AM_COND_IF' => 1,
-                        'AC_CONFIG_LIBOBJ_DIR' => 1,
+                        'AC_CANONICAL_TARGET' => 1,
                         'AC_SUBST' => 1,
-                        'AC_CANONICAL_HOST' => 1,
+                        'AC_CONFIG_LIBOBJ_DIR' => 1,
                         'AC_FC_SRCEXT' => 1,
+                        'AC_CANONICAL_HOST' => 1,
                         'AC_PROG_LIBTOOL' => 1,
                         'AM_INIT_AUTOMAKE' => 1,
-                        'AC_CONFIG_SUBDIRS' => 1,
                         'AM_PATH_GUILE' => 1,
+                        'AC_CONFIG_SUBDIRS' => 1,
                         'AM_AUTOMAKE_VERSION' => 1,
                         'LT_CONFIG_LTDL_DIR' => 1,
-                        'AC_CONFIG_LINKS' => 1,
                         'AC_REQUIRE_AUX_FILE' => 1,
-                        'LT_SUPPORTED_TAG' => 1,
+                        'AC_CONFIG_LINKS' => 1,
                         'm4_sinclude' => 1,
+                        'LT_SUPPORTED_TAG' => 1,
                         'AM_MAINTAINER_MODE' => 1,
                         'AM_NLS' => 1,
                         'AC_FC_PP_DEFINE' => 1,
                         'AM_GNU_GETTEXT_INTL_SUBDIR' => 1,
-                        '_m4_warn' => 1,
                         'AM_MAKEFILE_INCLUDE' => 1,
+                        '_m4_warn' => 1,
                         'AM_PROG_CXX_C_O' => 1,
-                        '_AM_MAKEFILE_INCLUDE' => 1,
                         '_AM_COND_ENDIF' => 1,
+                        '_AM_MAKEFILE_INCLUDE' => 1,
                         'AM_ENABLE_MULTILIB' => 1,
                         'AM_SILENT_RULES' => 1,
                         'AM_PROG_MOC' => 1,
                         'AC_CONFIG_FILES' => 1,
-                        'LT_INIT' => 1,
                         'include' => 1,
-                        'AM_GNU_GETTEXT' => 1,
+                        'LT_INIT' => 1,
                         'AM_PROG_AR' => 1,
+                        'AM_GNU_GETTEXT' => 1,
                         'AC_LIBSOURCE' => 1,
-                        'AC_CANONICAL_BUILD' => 1,
                         'AM_PROG_FC_C_O' => 1,
+                        'AC_CANONICAL_BUILD' => 1,
                         'AC_FC_FREEFORM' => 1,
-                        'AC_FC_PP_SRCEXT' => 1,
                         'AH_OUTPUT' => 1,
-                        'AC_CONFIG_AUX_DIR' => 1,
+                        'AC_FC_PP_SRCEXT' => 1,
                         '_AM_SUBST_NOTMAKE' => 1,
-                        'm4_pattern_allow' => 1,
-                        'AM_PROG_CC_C_O' => 1,
+                        'AC_CONFIG_AUX_DIR' => 1,
                         'sinclude' => 1,
-                        'AM_CONDITIONAL' => 1,
-                        'AC_CANONICAL_SYSTEM' => 1,
+                        'AM_PROG_CC_C_O' => 1,
+                        'm4_pattern_allow' => 1,
                         'AM_XGETTEXT_OPTION' => 1,
+                        'AC_CANONICAL_SYSTEM' => 1,
+                        'AM_CONDITIONAL' => 1,
                         'AC_CONFIG_HEADERS' => 1,
                         'AC_DEFINE_TRACE_LITERAL' => 1,
                         'AM_POT_TOOLS' => 1,
@@ -351,57 +351,57 @@
                         'configure.in'
                       ],
                       {
-                        '_LT_AC_TAGCONFIG' => 1,
                         'AM_PROG_F77_C_O' => 1,
-                        'AC_INIT' => 1,
+                        '_LT_AC_TAGCONFIG' => 1,
                         'm4_pattern_forbid' => 1,
-                        'AC_CANONICAL_TARGET' => 1,
+                        'AC_INIT' => 1,
                         '_AM_COND_IF' => 1,
-                        'AC_CONFIG_LIBOBJ_DIR' => 1,
+                        'AC_CANONICAL_TARGET' => 1,
                         'AC_SUBST' => 1,
-                        'AC_CANONICAL_HOST' => 1,
+                        'AC_CONFIG_LIBOBJ_DIR' => 1,
                         'AC_FC_SRCEXT' => 1,
+                        'AC_CANONICAL_HOST' => 1,
                         'AC_PROG_LIBTOOL' => 1,
                         'AM_INIT_AUTOMAKE' => 1,
-                        'AC_CONFIG_SUBDIRS' => 1,
                         'AM_PATH_GUILE' => 1,
+                        'AC_CONFIG_SUBDIRS' => 1,
                         'AM_AUTOMAKE_VERSION' => 1,
                         'LT_CONFIG_LTDL_DIR' => 1,
-                        'AC_REQUIRE_AUX_FILE' => 1,
                         'AC_CONFIG_LINKS' => 1,
-                        'LT_SUPPORTED_TAG' => 1,
+                        'AC_REQUIRE_AUX_FILE' => 1,
                         'm4_sinclude' => 1,
+                        'LT_SUPPORTED_TAG' => 1,
                         'AM_MAINTAINER_MODE' => 1,
                         'AM_NLS' => 1,
                         'AC_FC_PP_DEFINE' => 1,
                         'AM_GNU_GETTEXT_INTL_SUBDIR' => 1,
-                        '_m4_warn' => 1,
                         'AM_MAKEFILE_INCLUDE' => 1,
+                        '_m4_warn' => 1,
                         'AM_PROG_CXX_C_O' => 1,
-                        '_AM_COND_ENDIF' => 1,
                         '_AM_MAKEFILE_INCLUDE' => 1,
+                        '_AM_COND_ENDIF' => 1,
                         'AM_ENABLE_MULTILIB' => 1,
                         'AM_PROG_MOC' => 1,
                         'AM_SILENT_RULES' => 1,
                         'AC_CONFIG_FILES' => 1,
-                        'include' => 1,
                         'LT_INIT' => 1,
-                        'AM_PROG_AR' => 1,
+                        'include' => 1,
                         'AM_GNU_GETTEXT' => 1,
+                        'AM_PROG_AR' => 1,
                         'AC_LIBSOURCE' => 1,
-                        'AC_CANONICAL_BUILD' => 1,
                         'AM_PROG_FC_C_O' => 1,
+                        'AC_CANONICAL_BUILD' => 1,
                         'AC_FC_FREEFORM' => 1,
-                        'AC_FC_PP_SRCEXT' => 1,
                         'AH_OUTPUT' => 1,
-                        '_AM_SUBST_NOTMAKE' => 1,
+                        'AC_FC_PP_SRCEXT' => 1,
                         'AC_CONFIG_AUX_DIR' => 1,
-                        'AM_PROG_CC_C_O' => 1,
-                        'sinclude' => 1,
+                        '_AM_SUBST_NOTMAKE' => 1,
                         'm4_pattern_allow' => 1,
-                        'AM_CONDITIONAL' => 1,
-                        'AC_CANONICAL_SYSTEM' => 1,
+                        'sinclude' => 1,
+                        'AM_PROG_CC_C_O' => 1,
                         'AM_XGETTEXT_OPTION' => 1,
+                        'AC_CANONICAL_SYSTEM' => 1,
+                        'AM_CONDITIONAL' => 1,
                         'AC_CONFIG_HEADERS' => 1,
                         'AC_DEFINE_TRACE_LITERAL' => 1,
                         'AM_POT_TOOLS' => 1,
diff --git a/examples/test_qr.c b/examples/test_qr.c
index 6f77203afbba2c5b3e1c0a953eb9b89323609187..ada33a33acf4833ed478e10bcd0f0cadf466a5e5 100644
--- a/examples/test_qr.c
+++ b/examples/test_qr.c
@@ -28,283 +28,472 @@
 #include <unistd.h>
 #include <math.h>
 #include <omp.h>
+#include <pthread.h>
+
+
+
 
-/* LAPACKE header. */
-#include <lapacke.h>
 #include <cblas.h>
 
 
 /* Local includes. */
 #include "quicksched.h"
 
-/*
- * Sam's routines for the tiled QR decomposition.
- */
-
-/*
-  \brief Computes 2-norm of a vector \f$x\f$
-  
-  Computes the 2-norm by computing the following: \f[\textrm{2-norm}=\sqrt_0^lx(i)^2\f]
+/**
+ * Takes a column major matrix, NOT tile major. size is length of a side of the matrix. Only works for square matrices.
+ * This function is simply for validation and is implemented naively as we know of no implementation to retrieve Q from the tiled QR.
  */
-float do2norm(float* x, int l)
+double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum)
 {
-	float sum = 0, norm;
-	int i;
+    double* Q = malloc(sizeof(double)*size*size);
+    double* Qtemp = malloc(sizeof(double)*size*size);
+    double* w = malloc(sizeof(double)*size);
+    double* ww = malloc(sizeof(double)*size*size);
+    double* temp = malloc(sizeof(double)*size*size);
+    int i, k, l, j, n;
+    bzero(Q, sizeof(double)*size*size);
+    bzero(Qtemp, sizeof(double)*size*size);
+    bzero(ww, sizeof(double)*size*size);
+    for(i = 0; i < size; i++)
+    {
+        Q[i*size + i] = 1.0;
+    }
+    int numcoltile = size / tilesize;
+    int numrowtile = size / tilesize;
+    for(k = 0; k < numrowtile; k++)
+    {
+        for(l = 0; l < tilesize; l++)
+        {
+            bzero(Qtemp, sizeof(double)*size*size);
+            for(i = 0; i < size; i++)
+            {
+                Qtemp[i*size + i] = 1.0;
+            }
+
 
-	for(i = 0; i < l; i++)
-		sum += x[i] * x[i];
+            for(i = k; i < numcoltile; i++)
+            {
+                bzero(w, sizeof(double)*size);
+                
+                for(j = 0 ; j < tilesize; j++)
+                {
+                    w[i*tilesize + j] = HR[(k*tilesize+l)*size + i*tilesize+j];
+                }
+                    w[k*tilesize+l] = 1.0;
+                if(k*tilesize+l > i*tilesize)
+                {
+                    for(j = 0; j < k*tilesize+l; j++)
+                        w[j] = 0.0;
+                }
 
-	norm = sqrt(sum);
 
-	return norm;
+                /* Compute (I - tau*w*w')' */
+                for(j = 0; j < size; j++)
+                {
+                    for(n = 0; n < size; n++)
+                    {
+                        if(j != n)
+                            ww[n*size + j] = -tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n];
+                        else
+                            ww[n*size + j] = 1.0 - tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n];
+                    }
+                }
+
+                /* Qtemp = Qtemp * (I-tau*w*w')' */
+                cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size,  size, 1.0, Qtemp, size, ww, size, 0.0, temp, size);
+                double *b = Qtemp;
+                Qtemp = temp;
+                temp = b;
+            }   
+            cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Q, size, Qtemp, size, 0.0, temp, size);
+            double *b = Q;
+            Q = temp;
+            temp = b;         
+        }
+    }
+    
+    free(Qtemp);
+    free(w);
+    free(ww);
+    free(temp);
+    return Q;
 }
 
-/**
- * \brief Computes a Householder reflector from a pair of vectors from coupled blocks
- *
- * Calculates the Householder vector of the vector formed by a column in a pair of coupled blocks.
- * There is a single non-zero element, in the first row, of the top vector. This is passed as topDiag
- *
- * \param topDiag The only non-zero element of the incoming vector in the top block
- * \param ma The number of elements in the top vector
- * \param xb Pointer to the lower vector
- * \param l The number of elements in the whole vector
- * \param vk A pointer to a pre-allocated array to store the householder vector of size l
- *
- * \returns void
- */
-void calcvkfloat	(float topDiag,
-			int ma,
-			float* xb,
-			int l,
-			float* vk)
+double* getR(double*HR, int size)
 {
-	int sign, i;
-	float norm, div;
-	//same non-standard normalisation as for single blocks above, but organised without a temporary beta veriable
-
-	sign = topDiag >= 0.0 ? 1 : -1;
-	vk[0] = topDiag;
-	//use vk[0] as beta
-	for(i = 1; i < ma; i++)
-		vk[i] = 0;
-
-	for(; i < l; i++)
-		vk[i] = xb[i - ma];
-
-	norm = do2norm(vk, l);
-	vk[0] += norm * sign;
-
-	if(norm != 0.0)
-	{
-		div = 1/vk[0];
-	
-		for(i = 1; i < l; i++)
-			vk[i] *= div;
-	}
+    double* R = malloc(sizeof(double) * size * size);
+    int i, j;
+    bzero(R, sizeof(double) * size * size);
+    for(i = 0; i< size; i++)
+    {
+        for(j = 0; j <= i; j++)
+        {
+            R[i*size + j] = HR[i*size + j];
+        }
+    }
+    return R;
 }
 
-
-void updatefloatQ_WY	(float* blockA,
-			float* blockB,
-			float* blockTau,
-			int k, int ma, int mb, int n,
-			int ldm,
-			float* hhVector)//bottom, essential part.
+void printMatrix(double* Matrix, int m, int n, int tilesize)
 {
-	int i, j;
-
-	float tau = 1.0, beta;
-
-	/* Compute tau = 2/v'v */
-	for(i = 0; i < mb; i ++)
-		tau += hhVector[i] * hhVector[i];
-
-	tau = 2/tau;
+    int i, j;
+    
+    for(i=0; i < m*tilesize; i++)
+    {
+        for(j = 0; j < n*tilesize; j++)
+        {
+            printf(" %.3f ", Matrix[j*m*tilesize +i]);
 
-	for(j = k; j < n; j ++)
-	{
-		/* Compute v'*b_j */
-		beta = blockA[(j*ldm) + k];
+        }
+        printf("\n");
+    }
 
-		/* Then for lower half */
-		for(i = 0; i < mb; i ++)
-			beta += blockB[(j*ldm) + i] * hhVector[i];
+}
 
-		beta *= tau;
+double* columnToTile( double* columnMatrix, int size , int m , int n , int tilesize)
+{
+    double* TileMatrix;
+    TileMatrix = malloc(sizeof(double) * size );
+    if(TileMatrix == NULL)
+        error("failed to allocate TileMatrix");
+    int i,j,k,l;
 
-		/* Compute b_j = b_j - beta*v_k */
-		blockA[(j*ldm) + k] -= beta;
-		
-		for(i = 0; i < mb; i ++)
-			blockB[(j*ldm) + i] -= beta * hhVector[i];
-	}
+    for( i = 0; i < n ; i++ )
+    {
+        for( j = 0; j < m; j++ )
+        {
+            double *tileStart = &columnMatrix[i*m*tilesize*tilesize + j*tilesize];
+            double *tilePos = &TileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize];
+            for(k = 0; k < tilesize; k++)
+            {
+                tileStart = &columnMatrix[i*m*tilesize*tilesize+k*m*tilesize + j*tilesize];
+                for( l = 0; l < tilesize; l++ )
+                {
+                    tilePos[k*tilesize + l] = tileStart[l];
+                }
+            }
+        }
+    }
 
-	/* Insert vector below diagonal. */
-	for(i = 0; i < mb; i ++)
-		blockB[(k*ldm) + i] = hhVector[i];
+    return TileMatrix;
 
-	blockTau[k] = tau;
 }
 
-void DTSQRF	(float* blockA,
-		float* blockB,
-		float* blockTau,
-		int ma,
-		int mb,
-		int n,
-		int ldm,
-		float* hhVector)
+double* tileToColumn( double* tileMatrix, int size, int m , int n , int tilesize)
 {
-	int k;
-	float* xVectA, *xVectB;
-	
-	xVectA = blockA;
-	xVectB = blockB;
-
-	for(k = 0; k < n; k++)
-	{
-		//vk = sign(x[1])||x||_2e1 + x
-		//vk = vk/vk[0]
-		calcvkfloat(xVectA[0], ma - k, xVectB, (ma + mb) - k, hhVector);//returns essential
-
-		//matA(k:ma,k:na) = matA(k:ma,k:na) - (2/(vk.T*vk))*vk*(vk.T*matA(k:ma,k:na)
-		//update both blocks, preserving the vectors already stored below the diagonal in the top block and treating them as if they were zeros.
-		updatefloatQ_WY	(blockA, blockB,
-					blockTau,
-					k, ma, mb, n,
-					ldm,
-					hhVector + ma - k);
-
-		xVectA += ldm + 1;
-		xVectB += ldm;
-	}
+    double* ColumnMatrix;
+    ColumnMatrix = (double*) malloc(sizeof(double) * size );
+    if(ColumnMatrix == NULL)
+        error("failed to allocate ColumnMatrix");
+    int i,j,k,l;
+    for( i = 0; i < n ; i++ )
+    {
+        for(j = 0; j < m ; j++ )
+        {
+            /* Tile on ith column is at i*m*32*32.*/
+            /* Tile on jth is at j*32*32 */
+            double *tile = &tileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize];
+            /* Column starts at same position as tile. */
+            /* Row j*32.*/
+            double *tilePos = &ColumnMatrix[i*m*tilesize*tilesize + j*tilesize];
+            for( k = 0; k < tilesize; k++ )
+            {
+                for(l=0; l < tilesize; l++)
+                {
+                    tilePos[l] = tile[l];
+                }
+                /* Next 32 elements are the position of the tile in the next column.*/
+                tile = &tile[tilesize];
+                /* Move to the j*32th position in the next column. */                
+                tilePos = &tilePos[tilesize*m];
+
+            }
+        }   
+    }
+    return ColumnMatrix;
 }
 
-void DSSRFT	(float* blockV,
-		float* blockA, float* blockB,
-		float* blockTau,
-		int b, int n, int ldm)
+/* Routines for the tiled QR decomposition.*/
+
+/**
+ *
+ * @brief Computes the QR decomposition of a tile.
+ *
+ * @param cornerTile A pointer to the tile for which the decomposition is computed.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param k The value of k for the QR decomposition
+ * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+void DGEQRF(double* restrict cornerTile, int tileSize, double* restrict tauMatrix, int k, int tauNum, double* w)
 {
-	int i, j, k;
+    int i, j, n;
+    double norm=0.0, sign, u1, tau, z;
 
-	float tau, beta;
 
-	/* Compute b_j = b_j - tau*v*v'*b_j for each column j of blocks A & B,
-	   and for each householder vector v of blockV */
+    /*Find the householder vector for each row. */
+    for(i = 0; i < tileSize; i++)
+    {
+        norm = 0.0;
+        /*Fill w with the vector.*/
+        for(j=i; j < tileSize; j++)
+        {
+            /* ith row is i*tileSize, only want elements on diagonal or below.*/
+            w[ j ] = cornerTile[ i*tileSize +j ];
+            /*Find the norm as well */
+            norm = norm + w[j]*w[j];
+        }
+        if(w[i] >= 0.0)
+            sign = -1;
+        else
+            sign = 1;
 
-	/* For each column of B */
-	for(j = 0; j < n; j ++)
-	{
-		/* For each householder vector. */
-		for(k = 0; k < n; k ++)
-		{
-			/* tau = 2/v'v, computed earlier, stored in T(k,k). */
-			tau = blockTau[k];
+        norm = sqrt(norm);
 
-			/* Compute beta = v_k'b_j. */
-			/* v_k is >0 (=1) only at position k in top half. */
-			beta = blockA[(j*ldm) + k];
+        u1 = w[i] - sign*norm;
 
-			/* For lower portion of v_k, aligning with the lower block */
-			for(i = 0; i < b; i ++)
-				beta += blockB[(j*ldm) + i] * blockV[(k*ldm) + i];
+        if(u1 != 0.0)
+        {
+            for(j=i+1; j < tileSize; j++)
+                w[j] = w[j] / u1;
+        }
+        else
+        {
+            for(j=i+1; j < tileSize; j++)
+                w[j] = 0.0;
+        }
 
-			beta *= tau;
-			
-			/* Compute b_j = b_j - beta * v */
-			/* v_k = 1 at (k) in top half again */
-			blockA[(j*ldm) + k] -= beta;
-
-			/* Apply to bottom block. */
-			for(i = 0; i < b; i ++)
-				blockB[(j*ldm) + i] -= beta * blockV[(k*ldm) + i];
-		}
-	}
-}
+        if(norm != 0.0)
+            tau = -sign*u1/norm;
+        else
+            tau = 0.0;
 
-float* randomMatrix(int m, int n)
-{
-    float* Matrix;
-    Matrix = (float*) malloc( sizeof(float) * m*n*32*32);
-    if(Matrix == NULL)
-        error("Failed to allocate Matrix");
-    int r,c;
-    m = m*32;
-    n = n*32;
-for(c = 0; c < n; c++)
-	{
-		for(r = 0; r < m; r++)
-		{
-            //CO(i,j,m) ((m * j) + i)
-				Matrix[(m*c)+r] = ((float)(rand() % 201) - 100.0) / 100.0;
-		}
-	}
-return Matrix;
+        /*Store the below diagonal vector */
+        
+        for(j = i+1; j < tileSize; j++)
+        {
+            cornerTile[ i*tileSize +j ] = w[j];
+        }
+        cornerTile[ i*tileSize + i ] = sign*norm;
+        w[i] = 1.0;
+        /* Apply the householder transformation to the rest of the tile, for everything to the right of the diagonal. */
+        for(j = i+1; j < tileSize; j++)
+        {
+            /*Compute w'*A_j*/
+            z = cornerTile[ j*tileSize+i];
+            for(n = i+1; n < tileSize; n++)
+            {
+                z = z + cornerTile[j*tileSize+n] * w[n];
+            }
+            /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
+            for(n=i; n < tileSize; n++)
+            {
+                cornerTile[j*tileSize+n] = cornerTile[j*tileSize+n] - tau*w[n]*z;
+            }
+
+        }
+        /* Store tau. We're on k*tileSize+ith row. kth column.*/
+        tauMatrix[(k*tileSize+i)*tauNum+k] = tau;
+
+    }
 
 }
-			
-float* generateMatrix( int m, int n)
+
+
+/**
+ *
+ * @brief Applies the householder factorisation of the corner to the row tile.
+ *
+ * @param cornerTile A pointer to the tile for which the householder is stored.
+ * @param rowTiles The tile to which the householder is applied.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param jj The value of j for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize, int jj, int kk, double* restrict tauMatrix, int tauNum, double* w)
 {
-    float* Matrix;
-    Matrix = (float*) malloc( sizeof(float) * m*n*32*32);
-    if(Matrix == NULL)
-        error("Failed to allocate Matrix");
-    int i, j;
-    memset ( Matrix, 0, sizeof(float)*m*n*32*32 );
+    int i, j, n;
+    double z=0.0;
+
 
-    for(i = 0; i < n*32; i++)
+    /* For each row in the corner Tile*/
+    for(i = 0; i < tileSize; i++)
     {
-        for(j = 0; j < m*32; j++)
+        /*Get w for row i */
+        for(j = i; j < tileSize; j++)
         {
-                Matrix[i*m*32 + j] = (float)(i+j);
+            w[j] = cornerTile[i*tileSize + j];
         }
+        w[i] = 1.0;
+        
+        /* Apply to the row Tile */
+        for(j = 0; j < tileSize; j++)
+        {
+            z=0.0;
+            /* Below Diagonal!*/
+            /*Compute w'*A_j*/
+            for(n = i; n < tileSize; n++)
+            {
+                z = z + w[n] * rowTile[j*tileSize+n];
+            }
+            for(n = i; n < tileSize; n++)
+            {
+                rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z;
+            }
+        }
+
+        
     }
-    return Matrix;
+
+
 }
 
-float* createIdentity(int m, int n)
+/**
+ *
+ * @brief Applies the householder factorisation of the corner to the row tile.
+ *
+ * @param cornerTile The corner tile for this value of k.
+ * @param columnTile The tile for which householders are computed.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param ii The value of i for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+void DTSQRF( double* restrict cornerTile, double* restrict columnTile, int tilesize, int ii, int kk, double* restrict tauMatrix, int tauNum, double* w )
 {
-    float* Matrix;
-    Matrix = (float*) malloc( sizeof(float) * m*n*32*32);
-    if(Matrix == NULL)
-        error("Failed to allocate Matrix");
-    int i, j;
-    memset ( Matrix, 0, sizeof(float)*m*n*32*32 );
+    int i, j, n;
+    double norm=0.0, sign, u1, tau, z;
 
-    for(i = 0; i < n*32; i++)
+    /* For each column compute the householder vector. */
+    for(i = 0; i < tilesize; i++)
     {
-        for(j = 0; j < m*32; j++)
+        norm = 0.0;
+        w[i] = cornerTile[ i*tilesize+i ];
+        norm = norm + w[i]*w[i];
+        for(j = i+1; j < tilesize; j++)
+        {   
+            w[j] = 0.0;
+        }
+        for(j = 0; j < tilesize; j++)
+        {
+            w[tilesize+j] = columnTile[ i*tilesize+j ];
+            norm = norm + w[tilesize+j]*w[tilesize+j];
+        }
+        
+        norm = sqrt(norm);
+        if(w[i] >= 0.0)
+            sign = -1;
+        else
+            sign = 1;
+        
+
+        u1 = w[i] - sign*norm;
+        if(u1 != 0.0)
+        {
+            for(j = i+1; j < 2*tilesize; j++){
+                w[j] = w[j]/u1;
+            }
+        }else
         {
-            if(i==j)
+            for(j = i+1; j < 2*tilesize; j++)
+                w[j] = 0.0;
+        }
+
+        if(norm != 0)
+            tau = -sign*u1/norm;
+        else
+            tau = 0.0;
+
+        /* Apply to each row to the right.*/
+        for(j = i; j < tilesize; j++)
+        {
+            /* Find w'*A_j, w is 0s except for first value with upper tile.*/
+            z = 1.0 * cornerTile[ j*tilesize+i ];            
+            for(n = 0; n < tilesize; n++)
             {
-                Matrix[i*m*32 + j] = 1.0;
-            }else
+                z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ];
+            }
+            /* Apply to upper tile.*/
+            cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z;
+            for(n = i+1; n < tilesize; n++)
+            {
+                cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z;
+            }
+            /* Apply to lower tile.*/
+            for(n = 0; n < tilesize; n++)
             {
-                Matrix[i*m*32 + j] = 0.0;
+                columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z;
             }
+            
+        }
+        /* Store w*/
+        for(j = 0; j < tilesize; j++){
+            columnTile[ i*tilesize+j ] = w[tilesize+j];
         }
+        tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau;
     }
-    return Matrix;
 }
 
-void printMatrix(float* Matrix, int m, int n)
+/**
+ *
+ * @brief Applies the householder factorisation of the corner to the row tile.
+ *
+ * @param cornerTile A pointer to the tile to have the householder applied.
+ * @param columnTile The tile in which the householders are stored.
+ * @param rowTile The upper tile to have the householders applied.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param ii The value of i for the QR decomposition.
+ * @param jj The value of j for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+void DSSRFT( double* restrict cornerTile, double* restrict columnTile, double* restrict rowTile, int tilesize, int ii, int jj, int kk, double* restrict tauMatrix, int tauNum , double* w)
 {
-    int i, j;
+    int i, j, n;
+    double z;
+
     
-    for(i=0; i < m*32; i++)
+    for(i = 0; i < tilesize; i++)
     {
-        printf("{ ");
-        for(j = 0; j < n*32; j++)
+        for(j = 0; j < i; j++)
+            w[j] = 0.0;
+        w[i] = 1.0;
+        for(j = i+1; j < tilesize; j++)
+            w[j] = 0.0;
+        for(j = 0; j < tilesize; j++)
+            w[j+tilesize] = columnTile[i*tilesize +j];
+        
+        /* Apply householder vector (w) to the tiles.*/
+        for(j = 0; j < tilesize; j++)
         {
-            printf(" %.3f ", Matrix[j*m*32 +i]);
-
+            z = 0.0;
+            /* Compute w' * A_j */
+            for(n = 0; n < tilesize; n++)
+            {
+                z += w[n] * rowTile[j*tilesize+n];
+                z += w[n + tilesize] * cornerTile[j*tilesize+n];
+            }
+            for(n = 0; n < tilesize; n++)
+            {
+                rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z;
+                cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z;
+            }
         }
-        printf(" }");
-        printf("\n");
     }
 
-}
+}		
 			
 /**
  * @brief Computed a tiled QR factorization using QuickSched.
@@ -314,10 +503,10 @@ void printMatrix(float* Matrix, int m, int n)
  * @param nr_threads Number of threads to use.
  */
  
-void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
+void test_qr ( int m , int n , int K , int nr_threads , int runs, double *matrix ) {
 
     int k, j, i;
-    float *A, *A_orig, *tau;
+    double *A, *A_orig, *tau;
     struct qsched s;
     qsched_task_t *tid, tid_new;
     qsched_res_t *rid;
@@ -332,28 +521,22 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
     
         /* Decode the task data. */
         int *idata = (int *)data;
-        int i = idata[0], j = idata[1];//, k = idata[2];
-        float buff[ 2*K*K ];
+        int i = idata[0], j = idata[1], k = idata[2];
+        double ww[ 2*K ];
         
         /* Decode and execute the task. */
         switch ( type ) {
             case task_DGEQRF:
-                LAPACKE_sgeqrf_work( LAPACK_COL_MAJOR , K, K ,
-                                &A[ j*m*K*K + i*K ] , m*K , &tau[ j*m*K + i*K ] ,
-                                buff , 2*K*K );
-                
+                DGEQRF( &A[ (k*m+k)*K*K], K, tau, k, m, ww);
                 break;
             case task_DLARFT:
-                LAPACKE_slarft_work( LAPACK_COL_MAJOR , 'F' , 'C' ,
-                                K , K , &A[ i*m*K*K + i*K ] ,
-                                m*K , &tau[ i*m*K + i*K ] , &A[ j*m*K*K + i*K ] ,
-                                m*K );
+                DLARFT( &A[ (k*m+k)*K*K ],&A[(j*m+k)*K*K], K, j, k, tau, m, ww);
                 break;
             case task_DTSQRF:
-                //DTSQRF( &A[ j*m*K*K + j*K ] , &A[ j*m*K*K + i*K ] , &tau[ j*m*K + i*K ] , K , K , K , K*m , buff );
+                DTSQRF( &A[(k*m+k)*K*K], &A[(k*m+i)*K*K], K, i, k, tau, m , ww);
                 break;
             case task_DSSRFT:
-                //DSSRFT(	&A[ k*m*K + i*K ] , &A[ j*m*K*K + k*K ] , &A[ j*m*K*K + i*K ] , &tau[ k*m*K + i*K ] , K , K , K*m );
+                DSSRFT(&A[(j*m+i)*K*K], &A[(k*m+i)*K*K], &A[(j*m+k)*K*K], K, i, j, k, tau, m , ww);
                 break;
             default:
                 error( "Unknown task type." );
@@ -363,28 +546,20 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
         
     
     /* Allocate and fill the original matrix. */
-    if ( ( A = (float *)malloc( sizeof(float) * m * n * K * K ) ) == NULL ||
-         ( tau = (float *)malloc( sizeof(float) * m * n * K ) ) == NULL ||
-         ( A_orig = (float *)malloc( sizeof(float) * m * n * K * K ) ) == NULL )
+    if ( ( A = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL ||
+         ( tau = (double *)malloc( sizeof(double) * m * n * K ) ) == NULL ||
+         ( A_orig = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL )
         error( "Failed to allocate matrices." );
-    free(A_orig);
-//    for ( k = 0 ; k < m * n * K * K ; k++ )
-  //      A_orig[k] = 2*((float)rand()) / RAND_MAX - 1.0;
-//    A_orig = generateMatrix(m, n);
-	srand(5);
-    A_orig = randomMatrix(m,n);
-    printMatrix(A_orig, m, n);
-    printf("\n\n\n");
-    memcpy( A , A_orig , sizeof(float) * m * n * K * K );
-    bzero( tau , sizeof(float) * m * n * K );
-
-    LAPACKE_sgeqrf( LAPACK_COL_MAJOR , K*m, K*n ,
-                                A, m*K , tau);
+    for ( k = 0 ; k < m * n * K * K ; k++ )
+    {
+        if(matrix == NULL)
+            A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0;
+        else
+            A_orig[k] = matrix[k];
+    }
+    memcpy( A , A_orig , sizeof(double) * m * n * K * K );
+    bzero( tau , sizeof(double) * m * n * K );
     
-    printMatrix(A, m , n );
-    printf("\n\n\n");
-    memcpy( A , A_orig , sizeof(float) * m * n * K * K );
-    bzero( tau , sizeof(float) * m * n * K );
     /* Dump A_orig. */
     /* message( "A_orig = [" );
     for ( k = 0 ; k < m*K ; k++ ) {
@@ -404,7 +579,7 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
         error( "Failed to allocate tid/rid matrix." );
     for ( k = 0 ; k < m * n ; k++ ) {
         tid[k] = qsched_task_none;
-        rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none , NULL, 0 );
+        rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none );
         }
     
     /* Build the tasks. */
@@ -451,9 +626,10 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
                 qsched_adduse( &s , tid_new , rid[ k*m + i ] );
                 qsched_adduse( &s , tid_new , rid[ j*m + k ] );
                 qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
-                qsched_addunlock( &s , tid[ j*m + k ] , tid_new );
+                qsched_addunlock( &s, tid[j*m+i-1], tid_new);
                 if ( tid[ j*m + i ] != -1 )
                     qsched_addunlock( &s , tid[ j*m + i ] , tid_new );
+
                 tid[ j*m + i ] = tid_new;
                 }
         
@@ -481,13 +657,7 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
         tot_run += toc_run - tic;
         
         }
-        printf("tau = ");
-        for(k = 0; k < m * n * K ; k++)
-        {
-            printf("%.3f ", tau[k]);
-        }
-        printf("\n");
-        printMatrix(A, m, n);
+    
         
     /* Dump A. */
     /* message( "A = [" );
@@ -521,12 +691,60 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
     for ( k = 0 ; k < qsched_timer_count ; k++ )
         message( "timer %s is %lli ticks." , qsched_timer_names[k] , s.timers[k]/runs );
     
+    if(matrix != NULL)
+    {
+        for(k = 0; k < m*n*K*K; k++)
+            matrix[k] = A[k];
+    }
+
+ 
+    /* Test if the decomposition was correct.*/
+   /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+    double *Q = computeQ(tempMatrix, m*K, K, tau, m);
+    double *R = getR(tempMatrix, m*K);
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, m*K, R, m*K, 0.0, tempMatrix, m*K);
+        free(Q);
+        Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
+    for(i = 0; i < m * n * K * K; i++)
+    {
+        if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] < 0.995))
+            printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i], tempMatrix[i]);
+    }
+    free(tempMatrix);
+    free(Q);
+    free(R);*/
+
     /* Clean up. */
+    free(A);
+    free(A_orig);
+    free(tau);
+    free(tid);
+    free(rid);
     qsched_free( &s );
         
     }
 
 
+/* Generates a random matrix. */
+double* generateColumnMatrix(int size)
+{
+    double* matrix = malloc(sizeof(double)*size*size);
+    if(matrix == NULL)
+        error("Failed to allocate matrix");
+
+    unsigned long int m_z = 35532;
+    int i;
+    for(i = 0 ; i < size*size; i++)
+    {
+        m_z = (1664525*m_z + 1013904223) % 4294967296;
+        matrix[i] = m_z % 100;
+        if(matrix[i] < 0)
+            matrix[i] += 100;
+    }
+    return matrix;
+}
+
+
 /**
  * @brief Main function.
  */
@@ -578,7 +796,7 @@ int main ( int argc , char *argv[] ) {
     message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
         32*M , 32*N , nr_threads , runs );
         
-    test_qr( M , N , K , nr_threads , runs );
+    test_qr( M , N , K , nr_threads , runs , NULL);
     
     }
     
diff --git a/src/Makefile.am b/src/Makefile.am
index a6771251acbfdfec5fe69035c2f15742d72a24d9..651c009cffdeb821933117f0d24230cc35aec2d7 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -21,7 +21,7 @@ AUTOMAKE_OPTIONS=gnu
 # Add the debug flag to the whole thing
 AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \
     -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) -DTIMERS \
-    # -fsanitize=address -fno-omit-frame-pointer
+     #-fsanitize=address -fno-omit-frame-pointer
 
 # Assign a "safe" version number
 AM_LDFLAGS = -version-info 0:0:0
@@ -31,12 +31,12 @@ lib_LTLIBRARIES = libquicksched.la
 libquicksched_la_SOURCES = qsched.c queue.c
 
 # List required headers
-include_HEADERS = atomic.h lock.h queue.h qsched.h task.h res.h error.h
+include_HEADERS = atomic.h lock.h queue.h qsched.h task.h res.h error.h qsched.h
 
 # CUDA sources 
 if HAVE_CUDA
     SOURCES_CUDA = qsched.c queue.c cuda_queue.cu
-    CUDA_MYFLAGS = -O3 -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx --maxrregcount=32 -Xptxas="-v" -Xptxas -dlcm=cg -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -DWITH_CUDA
+    CUDA_MYFLAGS = -O3 -g -DCPU_TPS=3.1e9 -Xnvlink -rdc=true -lineinfo -src-in-ptx --maxrregcount=32 -Xptxas="-v" -Xptxas -dlcm=cg -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -DWITH_CUDA #-fsanitize=address -fno-omit-frame-pointer
 .cu: qsched.c queue.c
 .cu.o:
 	$(NVCC) -c $(NVCCFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) $< -o $@
@@ -50,5 +50,5 @@ endif
 if HAVE_CUDA
 lib_LTLIBRARIES += libquicksched_cuda.la
 libquicksched_cuda_la_SOURCES = $(SOURCES_CUDA)
-libquicksched_cuda_la_CFLAGS = -DFPTYPE_SINGLE $(AM_CFLAGS) $(CUDA_CFLAGS)
+libquicksched_cuda_la_CFLAGS = -DFPTYPE_SINGLE $(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS)
 endif
diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu
index 0b50bb1b715420f6384a6752e24a9b5620b9db78..6e35f8390e5f8f9cfa1f858e980587393b7555a8 100644
--- a/src/cuda_queue.cu
+++ b/src/cuda_queue.cu
@@ -24,13 +24,16 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "cuda_queue.h"
+extern "C"{
 #include "quicksched.h"
+}
+#include "cuda_queue.h"
 #include "res.h"
 
 /*Define task types required for GPU */
 #define type_load -100
 #define type_unload -101
+#define type_ghost -102
 
 
 /*Define if conflicts are enabled or not.*/
@@ -40,19 +43,64 @@
 __device__ struct queue_cuda cuda_queues[ cuda_numqueues ];
 __constant__ int cuda_nrqueues;
 __constant__ int cuda_queue_size;
+__constant__ int cuda_numtasks;
 
 __device__ struct task *tasks_cuda;
 __device__ qsched_res_t *locks_cuda;
 __device__ qsched_res_t *uses_cuda;
 __device__ qsched_task_t *deps_cuda;
 __device__ struct res *res_cuda;
+__device__ char *data_cuda;
+__device__ int cuda_barrier = 0;
 
+__device__ qsched_funtype fun;
 /**
  * @brief Get a task ID from the given queue.
  *
  */
+
+
+
+#ifdef GPU_locks
+__device__ __inline__ int cuda_trylock ( volatile int *l ) {
+    int res = atomicCAS( (int *)l, 0 , 1 );
+    return res;
+}
+
+__device__ __inline__ int cuda_lock ( volatile int *l ) {
+    while(atomicCAS( (int *)l, 0 , 1 ) != 0 );
+    return 0;
+}
+
+__device__ __inline__ int cuda_unlock ( volatile int *l ) {
+    int res = atomicCAS( (int *)l , 1 , 0) != 1 ;
+    return res;
+}   
+#endif
+
+/**
+ * @brief Copy bulk memory in a strided way.
+ *
+ * @param dest Pointer to destination memory.
+ * @param source Pointer to source memory.
+ * @param count Number of bytes to copy, must be a multiple of sizeof(int).
+ */
+ 
+__device__ __inline__ void cuda_memcpy_tasks ( void *dest , void *source , int count , int tid ) {
+
+    int k;
+    int *idest = (int *)dest, *isource = (int *)source;
+    
+    
+    /* Copy the data in chunks of sizeof(int). */
+    for ( k = threadIdx.x ; k < count/sizeof(int) ; k += blockDim.x ){
+        idest[k] = isource[k];
+//        idest[k] *= tid;
+        }
+        
+    }
  
-__device__ int cuda_queue_gettask ( struct queue_cuda *q ) {
+__device__ int cuda_queue_gettask ( struct queue_cuda *q, int wait ) {
 
     int ind, tid = -1;
     
@@ -66,7 +114,7 @@ __device__ int cuda_queue_gettask ( struct queue_cuda *q ) {
     /* Wrap the index. */
     ind %= cuda_queue_size; 
     /* Loop until there is a valid task at that index. */
-    while ( q->rec_count < q->count && ( tid = q->data[ind] ) < 0 );
+    while ( q->rec_count < q->count && ( tid = q->data[ind] ) < 0 && wait);
     
     /* Scratch the task from the queue */
     if ( tid >= 0 )
@@ -105,20 +153,190 @@ __device__ void cuda_queue_puttask ( struct queue_cuda *q , int tid ) {
 #ifdef GPU_locks
 
 
-//TODO
+/**
+ * @brief Unlock a resource and un-hold its parents.
+ *
+ * @param s Pointer to the #qsched.
+ * @param rid The ID of the resource to lock.
+ */
+ 
+__device__ void cuda_unlockres ( int rid ) {
+
+    int finger;
+        
+    /* Unlock the resource. */
+    cuda_unlock( &res_cuda[rid].lock );
+
+    /* Go back up the tree and undo the holds. */
+    for ( finger = res_cuda[rid].parent ; finger != qsched_res_none ; finger = res_cuda[finger].parent )
+        atomicAdd( (int *) &res_cuda[finger].hold, -1 );
+            
+    }
+
+
+/**
+ * @brief Lock a resource and hold its parents.
+ *
+ * @param rid The ID of the resource to lock.
+ *
+ * @return @c 1 if the resource could be locked, @c 0 otherwise.
+ */
+ 
+__device__ int cuda_lockres ( int rid ) {
+
+    int finger, finger2;
+    
+    /* Try to lock the root-level resource. */
+    if ( res_cuda[rid].hold || cuda_trylock( &res_cuda[rid].lock ) )
+        return 0;
+        
+    /* Did the resource get held in the meantime? */
+    if ( res_cuda[rid].hold ) {
+        cuda_unlock( &res_cuda[rid].lock );
+        return 0;
+        }
+        
+    /* Follow parents and increase their hold counter, but fail
+       if any are locked. */
+    for ( finger = res_cuda[rid].parent ; finger != qsched_res_none ; finger = res_cuda[finger].parent ) {
+        if ( cuda_trylock( &res_cuda[finger].lock ) )
+            break;
+        atomicAdd((int *) &res_cuda[finger].hold , 1);
+        cuda_unlock( &res_cuda[finger].lock );
+        }
+        
+    /* Did we fail on the way up? */
+    if ( finger != qsched_res_none ) {
+    
+        /* Unlock the resource. */
+        cuda_unlock( &res_cuda[rid].lock );
+    
+        /* Go back up the tree and undo the holds. */
+        for ( finger2 = res_cuda[rid].parent ; finger2 != finger ; finger2 = res_cuda[finger2].parent )
+            atomicAdd((int *) &res_cuda[finger2].hold, -1 );
+            
+        /* Fail. */
+        return 0;
+    
+        }
+        
+    /* Otherwise, all went well. */
+ 
+    else
+        return 1;
+    }
+
+
+/**
+ * @brief Try to get all the locks for a task.
+ * 
+ * @param tid The ID of the #task to lock.
+ *
+ * @return @c 1 if the resources could be locked, @c 0 otherwise.
+ */
+__device__ int cuda_locktask ( int tid ) {
+
 
+    int k;
+    struct task *t;
 
+    /* Get a pointer on the task. */
+    t = &tasks_cuda[tid];
+        
+    /* Try to lock all the task's locks. */
+    for ( k = 0 ; k < t->nr_locks ; k++ )
+        if ( cuda_lockres( t->locks[k] ) == 0 )
+            break;
+
+    /* If I didn't get all the locks... */
+    if ( k < t->nr_locks ) {
+        /* Unroll the locks I got. */
+        for ( k -= 1 ; k >= 0 ; k-- )
+            cuda_unlockres( t->locks[k] );
 
+        /* Fail. */
+        return 0;
 
+        }
+        
+    /* Otherwise, all went well. */
+    else {
+        return 1;
+        }
 
+}
 
+//TODO
 
+/**
+ * @brief Tell the #qsched that a task has completed.
+ *
+ * @param s Pointer to the #qsched.
+ * @param t Pointer to the completed #task.
+ */
+ 
+__device__ void cuda_done ( struct task *t ) {
 
+    int k;
+    struct task *t2;
+        
+    
+    /* Release this task's locks. */
+    for ( k = 0 ; k < t->nr_locks ; k++ )
+        cuda_unlockres( t->locks[k] );
+    
+    /* Loop over the task's unlocks... */
+    //for ( k = 0 ; k < t->nr_unlocks ; k++ ) {
+    
+        /* Get a grip on the unlocked task. */
+      //  t2 = &tasks_cuda[ t->unlocks[k] ];
+
+        /* Is the unlocked task ready to run? */
+        //if ( atomicAdd( &t2->wait, -1 ) == 1 && !( t2->flags & task_flag_skip ) )
+          //      cuda_queue_puttask( &cuda_queues[0] , t->unlocks[k] );
+            
+        //}
+        
+    /* Set the task stats. */
+    //t->toc = getticks();
+    //t->cost = t->toc - t->tic;
 
+    }
 
+/**
+ * @brief Get a task from the given task queue.
+ *
+ * Picks tasks from the queue sequentially and checks if they
+ * can be computed. If not, they are returned to the queue.
+ *
+ * This version of the routine does not check for conflicts.
+ *
+ * This routine blocks until a valid task is picked up, or the
+ * specified queue is empty.
+ */
+__device__ int runner_cuda_gettask ( struct queue_cuda *q, int wait ) {
 
+    int tid = -1;
+    
+    
+    /* Main loop. */
+    while ( ( tid = cuda_queue_gettask( q , wait) ) >= 0 ) {
+        if( cuda_locktask(tid) == 1 )
+            break;    
 
+        cuda_queue_puttask ( q , tid );
+        }
+        
+    /* Put this task into the recycling queue, if needed. */
+    if ( tid >= 0 ) {
+            q->rec_data[ atomicAdd( (int *)&q->rec_count , 1 ) ] = tid;
+        }
+        
+        
+    /* Return whatever we got. */
+    return tid;
 
+    }
 
 
 
@@ -135,13 +353,13 @@ __device__ void cuda_queue_puttask ( struct queue_cuda *q , int tid ) {
  * This routine blocks until a valid task is picked up, or the
  * specified queue is empty.
  */
-__device__ int runner_cuda_gettask ( struct queue_cuda *q ) {
+__device__ int runner_cuda_gettask ( struct queue_cuda *q, int wait ) {
 
     int tid = -1;
     
     
     /* Main loop. */
-    while ( ( tid = cuda_queue_gettask( q ) ) >= 0 ) {
+    while ( ( tid = cuda_queue_gettask( q , wait) ) >= 0 ) {
         break;    
         }
         
@@ -159,13 +377,387 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) {
 
 #endif
 
+__global__ void qsched_device_kernel ( )
+{
+    volatile __shared__ int tid;
+    int *src, *dest;
+    int i;
+
+
+    /* Pull a task from the queues*/
+    while ( 1 ) {
+        __syncthreads();
+        if ( threadIdx.x == 0 ) {
+             tid = -1;
+             if( cuda_queues[0].count == cuda_queues[0].rec_count )
+                tid = runner_cuda_gettask ( &cuda_queues[1], 1);
+             else
+                tid = runner_cuda_gettask ( &cuda_queues[1], 0);
+
+             if( tid < 0 )
+                tid = runner_cuda_gettask ( &cuda_queues[0], 1);
+            }
+            
+        /*Everyone wait for us to get a task id*/
+        __syncthreads();
+            
+        /* Exit if we didn't get a valid task. */
+        if(tid < 0)
+            break;
+
+
+        /*Start the task clock*/
+        if( threadIdx.x == 0 ) 
+            tasks_cuda[tid].tic = clock();
+
+        if( tasks_cuda[tid].type == type_load )
+        {
+        int *d = (int*)&data_cuda[tasks_cuda[tid].data];
+            src = (int*)res_cuda[d[0]].data;
+            dest = (int*)res_cuda[d[0]].gpu_data;
+            cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, tid);
+        }else if( tasks_cuda[tid].type == type_unload )
+        {
+        int *d = (int*)&data_cuda[tasks_cuda[tid].data];
+            src = (int*)res_cuda[d[0]].gpu_data;
+            dest = (int*)res_cuda[d[0]].data;
+            cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, d[0]);
+        }else{
+           fun(tasks_cuda[tid].type , &data_cuda[tasks_cuda[tid].data]);
+        }
+        __syncthreads();
+        /*Stop the task clock*/
+        if( threadIdx.x == 0 ) 
+            tasks_cuda[tid].toc = clock();
+
+        /*Unlocks*/
+    #ifdef GPU_locks
+        if(threadIdx.x == 0)
+            cuda_done( &tasks_cuda[tid] );
+       // __syncthreads();
+    #endif
+        for(i = threadIdx.x; i < tasks_cuda[tid].nr_unlocks; i += blockDim.x )
+        {
+            if( atomicSub( &tasks_cuda[tasks_cuda[tid].unlocks[i]].wait , 1 ) == 1 && !( tasks_cuda[tasks_cuda[tid].unlocks[i]].flags & task_flag_skip ))
+            {
+                cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] );
+            }
+        }
+    }
+
+
+/* Make a notch on the barrier, last one out cleans up the mess... */
+    __syncthreads();
+    if ( threadIdx.x == 0 )
+        tid = ( atomicAdd( &cuda_barrier , 1 ) == gridDim.x-1 );
+    __syncthreads();
+    if ( tid  ) {
+        if ( threadIdx.x == 0 ) {
+            cuda_barrier = 0;
+            volatile int *temp = cuda_queues[1].data; cuda_queues[1].data = cuda_queues[1].rec_data; cuda_queues[1].rec_data = temp;
+            
+            cuda_queues[0].first = 0;
+            cuda_queues[0].last = 0;
+            cuda_queues[0].rec_count = 0;
+            cuda_queues[1].first = 0;
+            cuda_queues[1].last = 0;
+            cuda_queues[1].rec_count = 0;
+
+            }
+
+//TODO
+      /*  for ( int j = threadIdx.x ; j < cuda_nr_tasks ; j+= blockDim.x )
+            for ( k = 0 ; k < tasks_cuda[j].nr_unlock ; k++ )
+            {
+                atomicAdd( (int *) &cuda_tasks[ cuda_tasks[j].unlock[k] ].wait , 1 );
+            }*/
+
+        
+        }
+
+}
 
+int maxVal( int *array, int size )
+{
+   int i, maxi=-32000;
+   for (i=0; i<size; i++)
+   {
+	 if (array[i]>maxi)
+	 {
+	    maxi=array[i];
+	 }
+   }
+   return maxi ;
+}
 
+int minVal( int *array, int size )
+{
+   int i, maxi=3200000;
+   for (i=0; i<size; i++)
+   {
+	 if (array[i]<maxi)
+	 {
+	    maxi=array[i];
+	 }
+   }
+   return maxi ;
+}
 
+void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, int parent, int *res, int *sorted )
+{
+    int i,j;
+    if(numChildren > 0 && size/numChildren > 128*sizeof(int))
+    {
+        /* Create dummy task for this resource and recurse to its children!*/
+        int task, utask;
+        task = qsched_addtask( s, type_ghost, task_flag_none, NULL, 0 , 0 );
+        s->res[ID].task = task;
+        utask = qsched_addtask( s , type_ghost, task_flag_none, NULL, 0 , 0 );
+        s->res[ID].task = utask;
+        if(parent >= 0)
+        {
+            /* Create dependecy to parent. */ 
+            qsched_addunlock(s, task, s->res[parent].task );
+            qsched_addunlock(s, s->res[parent].utask, utask );
+        }
+        for(i = sorted[ID]; i < sorted[ID+1]; i++)
+        {
+            qsched_create_loads(s, i, s->res[res[i]].size, sorted[i+1]-sorted[i], ID, res, sorted);
+        }
+    }else{
+    int task,utask;
+        task = qsched_addtask( s , type_load , task_flag_none, &ID, sizeof(int), 0 );
+        s->res[ID].task = task;
+        utask = qsched_addtask( s , type_unload, task_flag_none, &ID, sizeof(int), 0 );
+        s->res[ID].task = utask;
+        /* Create load task for this resource and set its children to completed with this task.*/
+        for( j = sorted[ID]; j < sorted[ID+1]; j++ )
+        {
+            s->res[res[j]].task = task;
+        }
+        /* If it has a parent then set the parents ghost task to be dependent on this.*/
+        if(parent >= 0)
+        {
+            qsched_addunlock(s, task, s->res[parent].task );
+            qsched_addunlock(s, s->res[parent].utask, utask);
+        }
+    }
+}
 
 void qsched_prepare_loads ( struct qsched *s ) {
 
-int i, task, unload, tb, j, k, unlocked=0;
+int i, task, unload, j , k , unlocked = 0;
+struct task *t;
+int *sorted, lastindex;
+int *res, *res_data;
+
+if(s->res[0].task != -1)
+{
+    printf("Tasks already initialised, not redoing load/unload tasks");
+    return;
+}
+
+/* Store number of children for each resource*/
+sorted = (int*) malloc(sizeof(int) * (s->count_res+1));
+res = (int*) malloc(sizeof(int) * s->count_res);
+res_data = (int*) malloc(sizeof(int)* s->count_res);
+memset( sorted, 0, sizeof(int) * (s->count_res+1));
+/* Count the children for each parent. Stored in sorted[parent +1]*/
+for(i = 0; i < s->count_res; i++)
+{
+    if(s->res[i].parent != qsched_res_none)
+    {
+        sorted[s->res[i].parent+1]++;
+    }
+}
+
+/* Run through the sorted array and turn count to indices. Id 0 starts at 0, Id 1 starts at 0+sorted[1] (number of children for resource 0)*/
+sorted[0] = 0;
+for(i = 1; i < s->count_res+1; i++ )
+{
+    sorted[i] = sorted[i] + sorted[i-1];
+}
+lastindex = sorted[s->count_res];
+
+for(i = 0; i < s->count_res; i++)
+{
+    int parent = s->res[i].parent;
+    if( parent == qsched_res_none )
+    {
+        res[lastindex] = i;
+        lastindex++;
+    }else{
+        res[sorted[parent]] = i;
+        sorted[parent]++;
+    }
+}
+
+/*Initialise the memory address array, already sorted by parents.*/
+int mini=0;
+for(i = 0; i < s->count_res; i++)
+{
+    res_data[i] = ((char *)s->res[res[i]].data) - (char*)s->res[0].data;
+    if(res_data[i] < mini)
+        mini = res_data[i]; 
+//    printf("%i ", res_data[i]);
+}
+for(i = 0; i < s->count_res; i++)
+{
+    res_data[i] -= mini;
+printf("%i ", res_data[i]);
+}
+printf("\n");
+
+/* Sort the children of each parent by memory address. */
+qsched_sort(res, res_data, sorted[0], minVal(res_data, sorted[0]), maxVal(res_data, sorted[0]));
+for(i = 1; i < s->count_res; i++)
+{
+    if(sorted[i] > sorted[i-1]){
+        qsched_sort(&res[sorted[i-1]], &res_data[sorted[i-1]], sorted[i]-sorted[i-1], minVal(&res_data[sorted[i-1]], sorted[i]-sorted[i-1]), maxVal(&res_data[sorted[i-1]], sorted[i]-sorted[i-1]));
+    }
+}
+
+/* Sort super resources by memory address.*/
+qsched_sort(&res[sorted[s->count_res-1]], &res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1], minVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1]), maxVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1]));
+
+
+/*for(i = 0; i < s->count_res; i++)
+{
+    printf("%i ", res_data[i]);
+}
+printf("\n");
+
+
+for(i = 0; i < s->count_res; i++)
+{
+printf("%i ", res[i]);
+}
+printf("\n");
+
+for(i = 0; i < s->count_res; i++)
+{
+    printf("%i ", s->res[res[i]].parent);   
+}
+printf("\n");*/
+/*res now contains an array of indices, first sorted by parent, then memory address of data. */
+int size=0;
+if(sorted[0] != 0)
+{
+    /* Check no overlapping resources.*/
+    for(i = 0; i < sorted[0]-1; i++)
+    {
+        if(res_data[i] + s->res[res[i]].size > res_data[i+1])
+            error("Overlapping resources are not allowed.");
+    }
+}
+
+for(i = 1; i < s->count_res; i++)
+{
+    if(sorted[i] > sorted[i-1])
+    {
+        for(j = sorted[i-1]; j < sorted[i]-1; j++)
+        {
+            if(res_data[j] + s->res[res[j]].size > res_data[j+1])
+                error("Overlapping resources are not allowed.");
+        }
+    }
+}
+
+/* Check super resources don't overlap.*/
+for( i = sorted[s->count_res-1]; i < s->count_res; i++ )
+{
+    if(res_data[i] + s->res[res[i]].size > res_data[i+1])
+        error("Overlapping resources are not allowed.");
+}
+
+/* Reposition sorted pointers so that sorted[i] points to the first child of task ID= i*/
+for(i = sorted[s->count_res]; i >= 0; i-- )
+{
+    sorted[i] = sorted[i-1];
+}
+/* If nothing overlaps create tasks.*/
+for( i = sorted[s->count_res]; i < s->count_res; i++ )
+{
+    /* Start from each parentless task.*/
+    int ID = res[i];
+    int size = s->res[ID].size;
+    int numChildren = sorted[ID+1] - sorted[ID];
+    int parent = -1;
+
+    qsched_create_loads(s, ID, size, numChildren, parent, res, sorted);
+
+}
+
+/* Check all tasks have load tasks - if not give parents (recursively)*/
+for(i = 0; i < s->count_res; i++ )
+{
+    if( s->res[i].task == -1 )
+    {
+        struct res *t;
+        for(t = &s->res[s->res[i].parent]; s->res[i].task == -1; t = &s->res[t->parent])
+        {
+            s->res[i].task = t->task;
+            if((t->parent == qsched_res_none) && s->res[i].task == -1)
+                error("Somehow load task wasn't initialised");
+        }
+    }
+
+    if( s->res[i].utask == -1 )
+    {
+        struct res *t;
+        for(t = &s->res[s->res[i].parent]; s->res[i].utask == -1; t = &s->res[t->parent])
+        {
+            s->res[i].utask = t->utask;
+            if((t->parent == qsched_res_none) && s->res[i].utask == -1)
+                error("Somehow unload task wasn't initialised");
+        }
+    }
+}
+
+
+
+/* Set up dependencies with the rest of the system.*/
+for( i = 0; i < s->count_res; i++ )
+{
+    int unlocked = 0;
+    for( j = 0; j < s->count; j++ )
+    {
+        struct task *t = &s->tasks[j];
+        for(k = 0; k < t->nr_uses; k++)
+        {
+            if(t->uses[k] == i)
+            {
+                qsched_addunlock(s, s->res[i].task ,j);
+                qsched_addunlock(s, j, s->res[i].utask);
+                unlocked = 1;
+                break;
+            }
+        }
+        if(unlocked == 1)
+        {
+            unlocked = 0;
+            continue;
+        }
+        for(k = 0; k < t->nr_locks; k++)
+        {
+            if(t->locks[k] == i)
+            {
+                qsched_addunlock(s, s->res[i].task ,j);
+                qsched_addunlock(s, j, s->res[i].utask);
+                break;
+            }
+        }
+    }
+}
+//error("Got to here");
+}
+
+
+
+
+void qsched_prepare_loads_old ( struct qsched *s ) {
+
+int i, task, unload, j, k, unlocked=0;
 struct task *t;
 /* Create a load task for each resource. */
 for(i = 0; i < s->count_res; i++)
@@ -173,17 +765,19 @@ for(i = 0; i < s->count_res; i++)
     
     if(s->res[i].size == 0)
         continue;
-
-
-
-
-    task = qsched_addtask( s , type_load , 0 , &i , sizeof(int) , 0 );
-    unload = qsched_addtask( s , type_unload, 0 , &i, sizeof(int), 0 );
+    if(s->res[i].task >= 0)
+        continue;
+//    cudaMalloc( &s->res[ i ].gpu_data, s->res[i].size );
+    task = qsched_addtask( s , type_load , task_flag_none , &i , sizeof(int) , 0 );
+    s->res[i].task = task;
+    printf("s->res[i].task = %i\n", s->res[i].task);
+    unload = qsched_addtask( s , type_unload, task_flag_none , &i, sizeof(int), 0 );
     /*Load task unlocks each task that uses or locks the specified resource */
+    /*Unload task is unlocked by each task that is unlocked by the load task. */
     for(j = 0; j < s->count; j++)
     {
           t = &s->tasks[j];
-        
+
         for(k = 0; k < t->nr_uses; k++)
         {
             if(t->uses[k] == i){
@@ -208,8 +802,8 @@ for(i = 0; i < s->count_res; i++)
 
         }
     }
-    
-    
+     //   qsched_adduse(s, task , i);
+       // qsched_adduse(s, unload , i);
 }
 
 
@@ -221,97 +815,405 @@ for(i = 0; i < s->count_res; i++)
 
 
 
-int qsched_prepare_cuda ( struct qsched *s ) {
+
+extern "C" void qsched_prepare_cuda ( struct qsched *s ) {
 
 int i;
-struct task *cuda_t;
-qsched_res_t *setup;
+int j, k, count;
+struct task *t, *tasks;
+struct task *cuda_t, *task, *temp;
+qsched_res_t *setup_l, *setup_u;
 qsched_task_t *setup_t;
 struct res *res_t;
 int *data;
+char *sdata;
 
-/*Setup Load and unload tasks*/
+/* Lock the sched. */
+    //lock_lock( &s->lock );
 
+    /* Get a pointer to the tasks, set the count. */
+    tasks = s->tasks;
+    count = s->count;
+    
+    /* If the sched is dirty... */
+    if ( s->flags & qsched_flag_dirty ) {
+    
+        /* Do the sorts in parallel, if possible. */
+        #pragma omp parallel
+        {
+    
+            /* Sort the unlocks. */
+            #pragma omp single nowait
+            qsched_sort( s->deps , s->deps_key , s->count_deps , 0 , count - 1 );
+
+            /* Sort the locks. */
+            #pragma omp single nowait
+            qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 );
+
+            /* Sort the uses. */
+            #pragma omp single nowait
+            qsched_sort( s->uses , s->uses_key , s->count_uses , 0 , count - 1 );
+            
+        }
+        
+        /* Run throught the tasks and link the locks and unlocks. */
+        tasks[0].unlocks = s->deps;
+        tasks[0].locks = s->locks;
+        tasks[0].uses = s->uses;
+        for ( k = 1 ; k < count ; k++ ) {
+            tasks[k].unlocks = &tasks[k-1].unlocks[ tasks[k-1].nr_unlocks ];
+            tasks[k].locks = &tasks[k-1].locks[ tasks[k-1].nr_locks ];
+            tasks[k].uses = &tasks[k-1].uses[ tasks[k-1].nr_uses ];
+            }
+        
+        /* All cleaned-up now! */
+        //s->flags &= ~qsched_flag_dirty;
+    
+        }
 
+qsched_prepare_loads(s);
 
 
+    
 
+    
 
+    
+    /* Get a pointer to the tasks, set the count. */
+    tasks = s->tasks;
+    count = s->count;
+    
+    /* If the sched is dirty... */
+    if ( s->flags & qsched_flag_dirty ) {
+    
+        /* Do the sorts in parallel, if possible. */
+        #pragma omp parallel
+        {
+    
+            /* Sort the unlocks. */
+            #pragma omp single nowait
+            qsched_sort( s->deps , s->deps_key , s->count_deps , 0 , count - 1 );
+
+            /* Sort the locks. */
+            #pragma omp single nowait
+            qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 );
+
+            /* Sort the uses. */
+            #pragma omp single nowait
+            qsched_sort( s->uses , s->uses_key , s->count_uses , 0 , count - 1 );
+            
+        }
+        
+        /* Run throught the tasks and link the locks and unlocks. */
+        tasks[0].unlocks = s->deps;
+        tasks[0].locks = s->locks;
+        tasks[0].uses = s->uses;
+        for ( k = 1 ; k < count ; k++ ) {
+            tasks[k].unlocks = &tasks[k-1].unlocks[ tasks[k-1].nr_unlocks ];
+            tasks[k].locks = &tasks[k-1].locks[ tasks[k-1].nr_locks ];
+            tasks[k].uses = &tasks[k-1].uses[ tasks[k-1].nr_uses ];
+            }
+        
+        /* All cleaned-up now! */
+        s->flags &= ~qsched_flag_dirty;
+    
+        }
+        
+    /* Init the queues. */
+    for ( k = 0 ; k < s->nr_queues ; k++ )
+        queue_init( &s->queues[k] , count );
 
+    /* Reset the waits to 0... */
+    for( k = 0; k < count; k++ )
+    {
+        tasks[k].wait = 0;
+    }    
+
+    /* Run through the tasks and set the waits... */
+    for ( k = 0 ; k < count ; k++ ) {
+        t = &tasks[k];
+        if ( !( t->flags & task_flag_skip ) )
+            for ( j = 0 ; j < t->nr_unlocks ; j++ )
+                tasks[ t->unlocks[j] ].wait += 1;
+        }
 
+        
+    /* Sort the tasks topologically. */
+    int *tid = (int *)malloc( sizeof(int) * count );
+    for ( j = 0 , k = 0 ; k < count ; k++ )
+        if ( tasks[k].wait == 0 ) {
+            tid[j] = k;
+            j += 1;
+            }
+    for ( k = 0 ; k < j ; k++ ) {
+        t = &tasks[ tid[k] ];
+        for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ )
+            if ( ( tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) {
+                tid[j] = t->unlocks[kk];
+                j += 1;
+                }
+        }
+    if ( k < count )
+        error( "Circular dependencies detected." );
+        
+    /* Run through the topologically sorted tasks backwards and
+       set their weights, re-setting the waits while we're at it. */
+    for ( k = count-1 ; k >= 0 ; k-- ) {
+        int maxweight = 0;
+        t = &tasks[ tid[k] ];
+        for ( j = 0 ; j < t->nr_unlocks ; j++ ) {
+            tasks[ t->unlocks[j] ].wait += 1;
+            if ( tasks[ t->unlocks[j] ].weight > maxweight )
+                maxweight = tasks[ t->unlocks[j] ].weight;
+            }
+        t->weight = t->cost + maxweight;
+        }
 
 
+/*Allocate temporary tasks to setup device tasks*/
+temp = (struct task *) malloc(s->count * sizeof(struct task));
+if(temp == NULL)
+    error("Failed to allocate the temporary task store.");
 
+memcpy(temp, s->tasks, s->count * sizeof(struct task));
 
 /*Copy the qsched data to the device*/
-if( cudaMalloc( &cuda_t , sizeof(struct task) * s->count ) != cudaSuccess )
-    error("Failed to allocate task array on the device.");
+if(cudaMalloc( &sdata , s->count_data ) != cudaSuccess )
+    error("Failed to allocate the qsched data on the device");
+if(cudaMemcpy( sdata , s->data , s->count_data , cudaMemcpyHostToDevice ) != cudaSuccess )
+    error("Failed to copy the qsched data to the device.");
+if(cudaMemcpyToSymbol( data_cuda , &sdata ,  sizeof(char *) , 0 ,cudaMemcpyHostToDevice ) != cudaSuccess )
+    error("Failed to copy data pointer to the device.");    
 
-if( cudaMemcpy( cuda_t, s->tasks, sizeof(struct task) * s->count , cudaMemcpyHostToDevice ) != cudaSuccess )
-    error("Failed to copy tasks to the device.");
-if( cudaMemcpyToSymbol ( cuda_t , tasks_cuda, sizeof(struct task *) ,  cudaMemcpyHostToDevice ) != cudaSuccess )
-    error("Failed to copy task pointer to the device.");
 
-if( cudaMalloc( &setup , sizeof(qsched_res_t) * s->count_locks ) != cudaSuccess )
+/*Copy the task data to the device*/
+
+if(s->count_locks > 0 )
+{
+if( cudaMalloc( &setup_l , sizeof(qsched_res_t) * s->count_locks ) != cudaSuccess )
     error("Failed to allocate locks array on the device.");
-if( cudaMemcpy( &setup , s->locks , sizeof(qsched_res_t) * s->count_locks, cudaMemcpyHostToDevice ) != cudaSuccess )
+if( cudaMemcpy( setup_l , s->locks , sizeof(qsched_res_t) * s->count_locks, cudaMemcpyHostToDevice ) != cudaSuccess )
     error("Failed to copy locks to the device.");
-if( cudaMemcpyToSymbol ( setup , locks_cuda, sizeof(qsched_res_t *),  cudaMemcpyHostToDevice ) != cudaSuccess )
+if( cudaMemcpyToSymbol ( locks_cuda, &setup_l , sizeof(qsched_res_t *), 0 ,  cudaMemcpyHostToDevice ) != cudaSuccess )
     error("Failed to copy locks pointer to the device.");
+for(i = 0; i < s->count; i++ )
+{
+    task = &temp[i];
+    task->locks = setup_l + (task->locks - s->locks);
+}
+}
 
 if( cudaMalloc( &setup_t , sizeof(qsched_task_t) * s->count_deps ) != cudaSuccess )
-    error("Failed to allocate deps array on the device.");
-if( cudaMemcpy( &setup_t , s->deps , sizeof(qsched_task_t) * s->count_deps, cudaMemcpyHostToDevice ) != cudaSuccess )
-    error("Failed to copy deps to the device.");
-if( cudaMemcpyToSymbol ( &setup_t , deps_cuda, sizeof(qsched_task_t *) ,  cudaMemcpyHostToDevice ) != cudaSuccess )
+    error("Failed to allocate deps array on the device: %s", cudaGetErrorString(cudaPeekAtLastError()));
+if( cudaMemcpy( setup_t , s->deps , sizeof(qsched_task_t) * s->count_deps, cudaMemcpyHostToDevice ) != cudaSuccess )
+    error("Failed to copy deps to the device: %s", cudaGetErrorString(cudaPeekAtLastError()));
+if( cudaMemcpyToSymbol ( deps_cuda, &setup_t , sizeof(qsched_task_t *) , 0 ,  cudaMemcpyHostToDevice ) != cudaSuccess )
     error("Failed to copy deps pointer to the device.");
 
-if( cudaMalloc( &uses_cuda , sizeof(qsched_res_t) * s->count_uses ) != cudaSuccess )
+
+
+if( cudaMalloc( &setup_u , sizeof(qsched_res_t) * s->count_uses ) != cudaSuccess )
     error("Failed to allocate use array on the device.");
-if( cudaMemcpy( &setup , s->uses , sizeof(qsched_res_t) * s->count_uses, cudaMemcpyHostToDevice ) != cudaSuccess )
+if( cudaMemcpy( setup_u , s->uses , sizeof(qsched_res_t) * s->count_uses, cudaMemcpyHostToDevice ) != cudaSuccess )
     error("Failed to copy locks to the device.");
-if( cudaMemcpyToSymbol ( setup , uses_cuda, sizeof(qsched_res_t *),  cudaMemcpyHostToDevice ) != cudaSuccess )
+if( cudaMemcpyToSymbol ( uses_cuda, &setup_u ,  sizeof(qsched_res_t *), 0 ,  cudaMemcpyHostToDevice ) != cudaSuccess )
     error("Failed to copy locks pointer to the device.");
 
+
 if( cudaMalloc( &res_t , sizeof(struct res) * s->count_res ) != cudaSuccess )
     error("Failed to allocated on the device.");
 
 for(i = 0; i < s->count_res; i++)
 {   
-    if(s->res[i].size == 0)
-        continue;
-    if( cudaMalloc( &data, sizeof(int) * s->res[i].size) != cudaSuccess )
-        error("Failed to allocate data space on the device.");
-    s->res[i].gpu_data = data;
+ //   if(s->res[i].size == 0)
+   //     continue;
+    //if( cudaMalloc( &data, s->res[i].size) != cudaSuccess )
+      //  error("Failed to allocate data space on the device.");
+    //s->res[i].gpu_data = data;
 }
 
 
-if( cudaMemcpy( &res_t , s->res , sizeof(struct res) * s->count_res , cudaMemcpyHostToDevice) != cudaSuccess )
-    error("Failed to copy resources to the device.");
-if( cudaMemcpyToSymbol( res_t , res_cuda , sizeof(struct res *) * s->count_res , cudaMemcpyHostToDevice) != cudaSuccess )
-    error("Failed to copy res pointer to the device.");
-
-
+if( cudaMemcpy( res_t , s->res , sizeof(struct res) * s->count_res , cudaMemcpyHostToDevice) != cudaSuccess )
+    error("Failed to copy resources to the device: %s", cudaGetErrorString(cudaPeekAtLastError()));
+if( cudaMemcpyToSymbol( res_cuda , &res_t , sizeof(struct res *) , 0 , cudaMemcpyHostToDevice) != cudaSuccess )
+    error("Failed to copy res pointer to the device: %s", cudaGetErrorString(cudaPeekAtLastError()));
 
+for(i = 0; i < s->count; i++ )
+{
+    task = &temp[i];
+    task->unlocks = setup_t + (task->unlocks - s->deps);
+}
+for(i = 0; i < s->count; i++ )
+{
+    task = &temp[i];
+    task->uses = setup_u + (task->uses - s->uses);
+}
 
+if( cudaMalloc( &cuda_t , sizeof(struct task) * s->count ) != cudaSuccess )
+    error("Failed to allocate task array on the device.");
+if( cudaMemcpy( cuda_t, temp, sizeof(struct task) * s->count , cudaMemcpyHostToDevice ) != cudaSuccess )
+    error("Failed to copy tasks to the device.");
+if( cudaMemcpyToSymbol ( tasks_cuda, &cuda_t ,  sizeof(struct task *) , 0 ,  cudaMemcpyHostToDevice ) != cudaSuccess )
+    error("Failed to copy task pointer to the device.");
 
 
 
 
+/*for(i = 0; i < s->count; i++)
+{
+    printf("Task:%i of type %i has wait %i\n", i, temp[i].type, temp[i].wait);
+}*/
 
 
 
 
 
 /* Initialize the queues. */
-int nr_queues= 2, k, qsize;
+int nr_queues= 2,qsize;
 //int *data;
 struct queue_cuda queues[ cuda_numqueues ];
 
+ qsize = max(2*s->count / nr_queues, 256);
+    if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
+            error("Failed to copy queue size to the device.");
+            
+    /* Allocate a temporary buffer for the queue data. */
+        if ( ( data = (int *)malloc( sizeof(int) * qsize ) ) == NULL )
+            error("Failed to allocate data buffer.");
+
+    queues[1].count = 0;
+    for(i = 0; i < s->count; i++)
+    {
+        if(temp[i].type == type_load && temp[i].wait == 0)
+            data[queues[1].count++] = i;
+    
+    }
+     for ( k = queues[1].count ; k < qsize ; k++ )
+        data[k] = -1;  
+    /* Allocate and copy the data. */
+    if ( cudaMalloc( &queues[1].data , sizeof(int) * qsize ) != cudaSuccess )
+        error("Failed to allocate queue data on the device.");
+    if ( cudaMemcpy( (void *)queues[1].data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
+        error("Failed to copy queue data pointer to the device");
+
+    for ( k = 0; k < qsize; k++ )
+            data[k] = -1;
+            
+    /* Allocate and copy the recyling data. */
+    if ( cudaMalloc( &queues[1].rec_data , sizeof(int) * qsize ) != cudaSuccess )
+        error("Failed to allocate queue data on the device.");
+    if ( cudaMemcpy( (void *)queues[1].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
+        error("Failed to copy queue data pointer to the device");
+    /* Allocate and copy the recyling data. */
+    if ( cudaMalloc( &queues[0].rec_data , sizeof(int) * qsize ) != cudaSuccess )
+        error("Failed to allocate queue data on the device.");
+    if ( cudaMemcpy( (void *)queues[0].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
+        error("Failed to copy queue data pointer to the device");
+        
+    /* Set some other values. */
+    queues[1].first = 0;
+    queues[1].last = queues[1].count;
+    queues[1].rec_count = 0;  
+
+    /* Init queue 0*/
+    queues[0].count = 0;
+    for ( k = 0; k < s->count ; k++ )
+    {
+        if(temp[k].type != type_load && temp[k].wait == 0){
+            data[queues[0].count++] = k;
+    }
+
+    }
+    queues[0].first = 0;
+    queues[0].last = queues[0].count;
+    queues[0].count = s->count - queues[1].count;
+    queues[0].rec_count = 0;    
+
+
+    /* Allocate and copy the data. */
+    if ( cudaMalloc( &queues[0].data , sizeof(int) * qsize ) != cudaSuccess )
+        error("Failed to allocate queue data on the device.");
+    if ( cudaMemcpy( (void *)queues[0].data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess )
+        error("Failed to copy queue data pointer to the device");
+
+       /* Copy the queue structures to the device. */
+        if ( cudaMemcpyToSymbol( cuda_queues , &queues , sizeof(struct queue_cuda) * nr_queues , 0 , cudaMemcpyHostToDevice ) != cudaSuccess )
+            error("Failed to copy the queues to the device");
 
+    /* Clean up. */
+    free( tid );
+    printf("Queue[0].count = %i , queue[1].count = %i\n", queues[0].count, queues[1].count);
+        
+    /* Set the number of waiting tasks. */
+    s->waiting = count;
+        
+    /* Set the ready flag. */
+    s->flags |= qsched_flag_ready;
+
+    free( temp );
+    free( data );
+    /* Unlock the sched. */
+    //lock_unlock_blind( &s->lock );
+    
+}
 
+struct task* qsched_get_timers( struct qsched *s, int numtasks )
+{
+    struct task *gpu_tasks = NULL, *cuda_tasks;
+    
+    cuda_tasks = (struct task*)malloc(numtasks * sizeof(struct task));
+    if(cuda_tasks == NULL)
+        error("Failed to allocate cuda_tasks");
+
+    if( cudaMemcpyFromSymbol(&gpu_tasks, tasks_cuda, sizeof(struct task*), 0 ) != cudaSuccess )
+        error("Failed to get symbol from device");
+
+    if( cudaMemcpy( cuda_tasks, gpu_tasks, sizeof(struct task) * numtasks, cudaMemcpyDeviceToHost) != cudaSuccess )
+        error("Failed to copy tasks from device to host");
 
 
+    return cuda_tasks;
 }
 
+/*void qsched_get_CUDA_tasks ( struct qsched *s , struct task *cuda_tasks , int numtasks )
+{
+
+}*/
+
+/**
+ * @brief Execute all the tasks in the current scheduler using
+ *        OpenMP.
+ *
+ * @param s Pointer to the #qsched.
+ * @param fun User-supplied function that will be called with the
+ *        task type and a pointer to the task data. This must be a __device__ function!
+ *
+ * This function is only available if QuickSched was compiled with
+ * OpenMP support.
+ */
+void qsched_run_CUDA ( struct qsched *s, qsched_funtype func) {
+
+#ifdef WITH_CUDA
+    double itpms = 1000.0 / CPU_TPS;
+    ticks tic, toc_run ;
+    tic = getticks();
+    qsched_prepare_cuda( s );
+  toc_run = getticks(); 
+	message( "prepare_cuda took %.3f ms" , ((double)(toc_run - tic)) * itpms );
+    cudaMemcpyToSymbol( fun , &func , sizeof(qsched_funtype));
+    tic = getticks();
+    qsched_device_kernel<<<1, 128 >>> (  );
+    if( cudaDeviceSynchronize() != cudaSuccess )
+        error("Failed to execute kernel:%s", cudaGetErrorString(cudaPeekAtLastError()));
+  toc_run = getticks(); 
+	message( "run_CUDA took %.3f ms" , ((double)(toc_run - tic)) * itpms );
+
+    // TODO Unload timers (optional).
+
+#else
+        error("QuickSched was not compiled with CUDA support.");
+
+#endif
+
+}
 
diff --git a/src/cuda_queue.h b/src/cuda_queue.h
index a546786ba406fc17ef584e50c4fe44ae19f8737a..27f41c5f32674b8bdbc1c6249b44ccb60b39e161 100644
--- a/src/cuda_queue.h
+++ b/src/cuda_queue.h
@@ -16,9 +16,10 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * 
  ******************************************************************************/
-#define cuda_max_unlock 128
 #define cuda_numqueues 2
 
+/** Type definition for the execution function in #qsched_run. */
+typedef void (*qsched_funtype)( int , void * );
 
 /** Struct for a task queue. */
 struct queue_cuda {
@@ -39,3 +40,8 @@ struct queue_cuda {
     volatile int *rec_data;
 
     };
+
+/*Define function prototypes*/
+void qsched_run_CUDA ( struct qsched *s, qsched_funtype fun );
+extern "C" void qsched_prepare_cuda ( struct qsched *s );
+struct task* qsched_get_timers( struct qsched *s, int numtasks );
diff --git a/src/lock.h b/src/lock.h
index f814a09e378b570fc606ff24535cc21adb60a5c3..5c14abfe76e7bac02f6e15512e7276b01ccd5fe3 100644
--- a/src/lock.h
+++ b/src/lock.h
@@ -27,28 +27,11 @@
 #endif
 
 #ifdef WITH_CUDA
-#undef INLINE
+#define static
 #endif
 
-
-#ifdef WITH_CUDA
-    #define lock_type volatile int
-    #define lock_init( l ) ( *l = 0 )
-    #define lock_destroy( l ) 0
-    __device__ inline static int lock_lock ( volatile int *l ) {
-        while( atomicCAS( (int *)l , 0 , 1 ) != 0 );
-        return 0;
-        }
-    __device__ inline static int lock_trylock ( volatile int *l ) {
-        int res = atomicCAS( (int *)l , 0 , 1 );
-        return res;
-    }
-    __device__ inline static int lock_unlock ( volatile int *l ) {
-        int res = atomicCAS ( (int *)l , 1 , 0 ) != 1;
-        return res;
-    }
-    #define lock_unlock_blind( l ) atomicCAS( &l , 1 , 0 )
-#elif defined (PTHREAD_LOCK)
+#ifdef PTHREAD_LOCK
+#ifndef WITH_CUDA
     #define lock_type pthread_spinlock_t
     #define lock_init( l ) ( pthread_spin_init( l , PTHREAD_PROCESS_PRIVATE ) != 0 )
     #define lock_destroy( l ) ( pthread_spin_destroy( l ) != 0 )
@@ -69,3 +52,20 @@
     #define lock_unlock( l ) ( __sync_val_compare_and_swap( l , 1 , 0 ) != 1 )
     #define lock_unlock_blind( l ) __sync_val_compare_and_swap( l , 1 , 0 )
 #endif
+#else
+    #define lock_type volatile int
+    #define lock_init( l ) ( *l = 0 )
+    #define lock_destroy( l ) 0
+    INLINE static int lock_lock ( volatile int *l ) {
+        while ( __sync_val_compare_and_swap( l , 0 , 1 ) != 0 );
+            // while( *l );
+        return 0;
+        }
+    #define lock_trylock( l ) ( ( *(l) ) ? 1 : __sync_val_compare_and_swap( l , 0 , 1 ) )
+    #define lock_unlock( l ) ( __sync_val_compare_and_swap( l , 1 , 0 ) != 1 )
+    #define lock_unlock_blind( l ) __sync_val_compare_and_swap( l , 1 , 0 )
+#endif
+
+#ifdef WITH_CUDA
+#undef static
+#endif
diff --git a/src/qsched.c b/src/qsched.c
index 5bcf1487d8b321ae308f5aa46254e8fdbb9902b6..cbc148424029ca39014c752194fc8a41d766d1dc 100644
--- a/src/qsched.c
+++ b/src/qsched.c
@@ -276,6 +276,8 @@ void qsched_reset ( struct qsched *s ) {
     }
 
 
+
+
 /**
  * @brief Execute all the tasks in the current scheduler using
  *        OpenMP.
@@ -698,7 +700,7 @@ void qsched_unlocktask ( struct qsched *s , int tid ) {
  * will require the #qsched to be re-prepared.
  */
  
-struct task *qsched_gettask ( struct qsched *s , int qid ) {
+struct task* qsched_gettask ( struct qsched *s , int qid ) {
 
     int naq, k, tid, qids[ s->nr_queues ];
     struct task *t;
@@ -785,6 +787,70 @@ struct task *qsched_gettask ( struct qsched *s , int qid ) {
 
     }
 
+/**
+ * @brief Sort the data according to the given indices.
+ *
+ * @param data The data to be sorted
+ * @param ind The indices with respect to which the data are sorted.
+ * @param N The number of entries
+ * @param min Lowest index.
+ * @param max highest index.
+ *
+ * This function calls qsched_quicksort.
+ */
+ 
+void qsched_sort ( int *restrict data, int *restrict ind, int N, int min, int max ) {
+    int *new_data;
+    int *new_ind;
+    int i;
+    if(N <= 0)
+        return;
+    new_data = (int*)malloc(sizeof(int) * N);
+    if(new_data == NULL)
+        error("Failed to allocate new_data");
+    new_ind = (int*)malloc(sizeof(int) * N);
+    if(new_ind == NULL)
+        error("Failed to allocate new_ind");
+
+    /*Create buckets of size ? - Ideally <16 elements per bucket. Use max-min / N * 10 ? Should give average of 10 elements per bucket */
+    int bucketsize = 1;
+
+    /* To find bucket do ind-min / b and it goes in that bucket.*/
+    int num_buckets = (max-min) / bucketsize +1;
+    int *bucket_inds = (int*) malloc(sizeof(int) * num_bucket);
+    if(bucket_inds == NULL)
+        error("Failed to allocate bucket_inds");
+    memset(bucket_inds,0, sizeof(int)*num_buckets);
+    for(i = 0; i < N; i++)
+    {
+        bucket_inds[(ind[i]-min)]++;
+    }
+    for(i = 1; i < num_buckets; i++ )
+    {
+        bucket_inds[i] = bucket_inds[i] + bucket_inds[i-1];
+    }
+    /* bucket_inds[i] contains the starting position for the i+1th bucket*/
+    for(i = num_buckets-1; i >0; i--)
+    {
+        bucket_inds[i] = bucket_inds[i-1];
+    }
+    bucket_inds[0] = 0;
+
+    for(i = 0; i < N; i++)
+    {
+        int z = (ind[i]-min);
+        new_data[bucket_inds[z]] = data[i];
+        new_ind[bucket_inds[z]++] = ind[i];
+    }
+
+    /* Copy data back to data and ind and deallocate everything!*/
+    memcpy(data, new_data, N*sizeof(int));
+    memcpy(ind, new_ind, N*sizeof(int));
+    free(new_data);
+    free(new_ind);
+    free(bucket_inds);
+
+}
 
 /**
  * @brief Sort the data according to the given indices.
@@ -798,7 +864,7 @@ struct task *qsched_gettask ( struct qsched *s , int qid ) {
  * This function calls itself recursively.
  */
  
-void qsched_sort ( int *restrict data , int *restrict ind , int N , int min , int max ) {
+void qsched_quicksort ( int *restrict data , int *restrict ind , int N , int min , int max ) {
 
     int pivot = (min + max) / 2;
     int i = 0, j = N-1;
@@ -806,7 +872,7 @@ void qsched_sort ( int *restrict data , int *restrict ind , int N , int min , in
     
     /* If N is small enough, just do insert sort. */
     if ( N < 16 ) {
-    
+        printf("%i\n", N);
         for ( i = 1 ; i < N ; i++ )
             if ( ind[i] < ind[i-1] ) {
                 temp_i = ind[i];
@@ -842,13 +908,13 @@ void qsched_sort ( int *restrict data , int *restrict ind , int N , int min , in
             /* Recurse on the left? */
             if ( j > 0  && pivot > min ) {
                 #pragma omp task untied
-                qsched_sort( data , ind , j+1 , min , pivot );
+                qsched_quicksort( data , ind , j+1 , min , pivot );
                 }
 
             /* Recurse on the right? */
             if ( i < N && pivot+1 < max ) {
                 #pragma omp task untied
-                qsched_sort( &data[i], &ind[i], N-i , pivot+1 , max );
+                qsched_quicksort( &data[i], &ind[i], N-i , pivot+1 , max );
                 }
 
             }
@@ -856,11 +922,11 @@ void qsched_sort ( int *restrict data , int *restrict ind , int N , int min , in
             
             /* Recurse on the left? */
             if ( j > 0  && pivot > min )
-                qsched_sort( data , ind , j+1 , min , pivot );
+                qsched_quicksort( data , ind , j+1 , min , pivot );
 
             /* Recurse on the right? */
             if ( i < N && pivot+1 < max )
-                qsched_sort( &data[i], &ind[i], N-i , pivot+1 , max );
+                qsched_quicksort( &data[i], &ind[i], N-i , pivot+1 , max );
             
             }
             
@@ -885,9 +951,6 @@ void qsched_prepare ( struct qsched *s ) {
     /* Lock the sched. */
     lock_lock( &s->lock );
 
-    #ifdef WITH_CUDA
-    qsched_prepare_loads( s );
-    #endif
     
     /* Get a pointer to the tasks, set the count. */
     tasks = s->tasks;
@@ -1005,9 +1068,11 @@ void qsched_prepare ( struct qsched *s ) {
  * @param size Size of the data in bytes.
  * @return The ID of the new shared resource.
  */
- 
+#ifdef WITH_CUDA
+int qsched_addres ( struct qsched *s , int owner , int parent, void* data, int size , void* gpu_data) {
+#else 
 int qsched_addres ( struct qsched *s , int owner , int parent, void* data, int size ) {
-
+#endif
     struct res *res_new;
     int id;
 
@@ -1047,10 +1112,26 @@ int qsched_addres ( struct qsched *s , int owner , int parent, void* data, int s
     s->res[ id ].data = data;
     s->res[ id ].size = size;
 
+    /* Resources data must be contained by parent*/
+    if( s->res[ id ].parent != qsched_res_none && data != NULL)
+    {
+        char* parentdata = (char*)s->res[ parent ].data;
+        char* childdata = (char*) data;
+        int pos = childdata-parentdata;
+        if(pos < 0 || pos + size > s->res[parent].size)
+        {
+            error("Data for a child resource must be contained in the data of its parent");
+        }
+    }
+
     #ifdef WITH_CUDA
-    cudaMalloc( &s->res[ id ].cuda_data, size );
+    s->res[ id ].gpu_data = gpu_data;
+    s->res[ id ].task = -1;
+    s->res[ id ].utask = -1;
     #endif
 
+
+
         
     /* Unlock the sched. */
     lock_unlock_blind( &s->lock );
diff --git a/src/qsched.h b/src/qsched.h
index a2d18d3818d074a5e74c9c640a85e0503f6fee7e..dfa9cde5f36d53704ee07f0419f082151bb79d43 100644
--- a/src/qsched.h
+++ b/src/qsched.h
@@ -177,7 +177,11 @@ void qsched_enqueue ( struct qsched *s , struct task *t );
 
 /* External functions. */
 void qsched_init ( struct qsched *s , int nr_queues , int flags );
+#ifdef WITH_CUDA
+qsched_res_t qsched_addres ( struct qsched *s , int owner , qsched_res_t parent , void* data, int size, void* gpu_data);
+#else
 qsched_res_t qsched_addres ( struct qsched *s , int owner , qsched_res_t parent , void* data, int size);
+#endif
 void qsched_addlock ( struct qsched *s , qsched_task_t t , qsched_res_t res );
 void qsched_addunlock ( struct qsched *s , qsched_task_t ta , qsched_task_t tb );
 qsched_task_t qsched_addtask ( struct qsched *s , int type , unsigned int flags , void *data , int data_size , int cost );
diff --git a/src/res.h b/src/res.h
index bb2b3043eb974d09193a6dd06b153b111bfe198c..8418b6fa1d89118ae2e079d57d02887087d98adb 100644
--- a/src/res.h
+++ b/src/res.h
@@ -16,7 +16,9 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  * 
  ******************************************************************************/
-
+#ifdef WITH_CUDA
+#define res_no_task -2
+#endif
 
 /* The res data structure. */
 struct res {
@@ -36,12 +38,17 @@ struct res {
     /* Pointer to data on the CPU */
     void* data;
 
-    /*Size of the data associated with this resource */
+    /*Size of the data (in bytes) associated with this resource */
     int size;
 
     #ifdef WITH_CUDA
     /* Pointer to the data on the GPU */
     void* gpu_data;
+
+    /* Index of the load task for this resource, if required.*/
+    int task;
+    /* Index of the unload task for this resouce, if required.*/
+    int utask;
     #endif
 
     };