Updated the GPU stuff. Examples work correctly with autogenerated load tasks...

Updated the GPU stuff. Examples work correctly with autogenerated load tasks and inorder dependency generation.

Updated the GPU stuff. Examples work correctly with autogenerated load tasks...
b3d5fb92 · Aidan Chalk · 422b2756 · b3d5fb92 · b3d5fb92 · b3d5fb92
Commit b3d5fb92 authored 10 years ago by Aidan Chalk
--- a/autom4te.cache/requests
+++ b/autom4te.cache/requests
@@ -68,11 +68,11 @@
                        '_LT_AC_LANG_C_CONFIG' => 1,
                        'AM_PROG_INSTALL_STRIP' => 1,
                        '_m4_warn' => 1,
-                        'AC_LIBTOOL_OBJDIR' => 1,
                        'AX_GCC_X86_CPUID' => 1,
+                        'AC_LIBTOOL_OBJDIR' => 1,
                        'gl_FUNC_ARGZ' => 1,
-                        'AM_SANITY_CHECK' => 1,
                        'LTOBSOLETE_VERSION' => 1,
+                        'AM_SANITY_CHECK' => 1,
                        'AC_LIBTOOL_LANG_GCJ_CONFIG' => 1,
                        'AC_LIBTOOL_PROG_COMPILER_PIC' => 1,
                        'LT_LIB_M' => 1,
@@ -85,21 +85,21 @@
                        'AC_LIBTOOL_GCJ' => 1,
                        'DX_CLEAR_DEPEND' => 1,
                        '_LT_WITH_SYSROOT' => 1,
-                        'LT_FUNC_DLSYM_USCORE' => 1,
                        'LT_SYS_DLOPEN_DEPLIBS' => 1,
-                        '_LT_AC_LANG_F77' => 1,
+                        'LT_FUNC_DLSYM_USCORE' => 1,
                        'AC_LIBTOOL_CONFIG' => 1,
-                        '_AM_SUBST_NOTMAKE' => 1,
+                        '_LT_AC_LANG_F77' => 1,
                        'AC_LTDL_DLLIB' => 1,
+                        '_AM_SUBST_NOTMAKE' => 1,
                        '_AM_AUTOCONF_VERSION' => 1,
                        'AM_DISABLE_SHARED' => 1,
                        '_LT_PROG_ECHO_BACKSLASH' => 1,
                        '_LTDL_SETUP' => 1,
-                        '_LT_AC_LANG_CXX' => 1,
                        'AM_PROG_LIBTOOL' => 1,
-                        'AC_LIB_LTDL' => 1,
-                        '_LT_AC_FILE_LTDLL_C' => 1,
+                        '_LT_AC_LANG_CXX' => 1,
                        'AM_PROG_LD' => 1,
+                        '_LT_AC_FILE_LTDLL_C' => 1,
+                        'AC_LIB_LTDL' => 1,
                        'DX_DOXYGEN_FEATURE' => 1,
                        'AU_DEFUN' => 1,
                        'AC_PROG_NM' => 1,
@@ -113,58 +113,58 @@
                        '_AM_SET_OPTION' => 1,
                        'AC_LTDL_PREOPEN' => 1,
                        '_LT_LINKER_BOILERPLATE' => 1,
-                        'AC_LIBTOOL_LANG_CXX_CONFIG' => 1,
-                        'AC_LIBTOOL_PROG_CC_C_O' => 1,
                        '_LT_PREPARE_SED_QUOTE_VARS' => 1,
+                        'AC_LIBTOOL_PROG_CC_C_O' => 1,
+                        'AC_LIBTOOL_LANG_CXX_CONFIG' => 1,
                        'gl_PREREQ_ARGZ' => 1,
                        'AX_EXT' => 1,
-                        'AM_OUTPUT_DEPENDENCY_COMMANDS' => 1,
                        'LT_SUPPORTED_TAG' => 1,
-                        'LT_SYS_MODULE_EXT' => 1,
+                        'AM_OUTPUT_DEPENDENCY_COMMANDS' => 1,
                        'LT_PROG_RC' => 1,
+                        'LT_SYS_MODULE_EXT' => 1,
                        'AC_DEFUN_ONCE' => 1,
                        'AX_CHECK_COMPILE_FLAG' => 1,
                        'DX_XML_FEATURE' => 1,
                        '_LT_AC_LANG_GCJ' => 1,
                        'AC_LTDL_OBJDIR' => 1,
-                        '_LT_PATH_TOOL_PREFIX' => 1,
                        'DX_TEST_FEATURE' => 1,
+                        '_LT_PATH_TOOL_PREFIX' => 1,
                        'AC_LIBTOOL_RC' => 1,
-                        'AM_SILENT_RULES' => 1,
-                        'AC_DISABLE_FAST_INSTALL' => 1,
                        '_LT_AC_PROG_ECHO_BACKSLASH' => 1,
+                        'AC_DISABLE_FAST_INSTALL' => 1,
+                        'AM_SILENT_RULES' => 1,
                        'DX_CHECK_DEPEND' => 1,
                        'DX_FEATURE_pdf' => 1,
-                        '_LT_AC_SYS_LIBPATH_AIX' => 1,
-                        '_LT_AC_TRY_DLOPEN_SELF' => 1,
-                        'DX_REQUIRE_PROG' => 1,
                        'include' => 1,
+                        'DX_REQUIRE_PROG' => 1,
+                        '_LT_AC_TRY_DLOPEN_SELF' => 1,
+                        '_LT_AC_SYS_LIBPATH_AIX' => 1,
                        'LT_AC_PROG_SED' => 1,
                        'AM_ENABLE_SHARED' => 1,
                        'DX_FEATURE_html' => 1,
                        'LTDL_INSTALLABLE' => 1,
-                        '_LT_AC_LANG_GCJ_CONFIG' => 1,
                        'DX_CURRENT_DESCRIPTION' => 1,
+                        '_LT_AC_LANG_GCJ_CONFIG' => 1,
                        'AC_ENABLE_SHARED' => 1,
-                        'AC_ENABLE_STATIC' => 1,
-                        'AC_LIBTOOL_SYS_HARD_LINK_LOCKS' => 1,
                        '_LT_REQUIRED_DARWIN_CHECKS' => 1,
-                        '_LT_AC_TAGVAR' => 1,
-                        'AM_PROG_CC_C_O' => 1,
+                        'AC_LIBTOOL_SYS_HARD_LINK_LOCKS' => 1,
+                        'AC_ENABLE_STATIC' => 1,
                        'AX_FUNC_POSIX_MEMALIGN' => 1,
+                        'AM_PROG_CC_C_O' => 1,
+                        '_LT_AC_TAGVAR' => 1,
                        'AC_LIBTOOL_LANG_F77_CONFIG' => 1,
                        'AM_CONDITIONAL' => 1,
                        'LT_LIB_DLLOAD' => 1,
                        'DX_FEATURE_dot' => 1,
-                        'LTDL_INIT' => 1,
-                        '_LT_PROG_F77' => 1,
-                        '_LT_PROG_CXX' => 1,
                        'LTVERSION_VERSION' => 1,
-                        'AM_PROG_INSTALL_SH' => 1,
+                        '_LT_PROG_CXX' => 1,
+                        '_LT_PROG_F77' => 1,
+                        'LTDL_INIT' => 1,
                        'm4_include' => 1,
+                        'AM_PROG_INSTALL_SH' => 1,
                        'AC_PROG_EGREP' => 1,
-                        '_AC_AM_CONFIG_HEADER_HOOK' => 1,
                        'AC_PATH_MAGIC' => 1,
+                        '_AC_AM_CONFIG_HEADER_HOOK' => 1,
                        'AC_LTDL_SYSSEARCHPATH' => 1,
                        'AM_MAKE_INCLUDE' => 1,
                        'LT_CMD_MAX_LEN' => 1,
@@ -180,51 +180,51 @@
                        'AC_PROG_LD_RELOAD_FLAG' => 1,
                        'DX_FEATURE_chi' => 1,
                        'AC_LTDL_DLSYM_USCORE' => 1,
-                        'LT_LANG' => 1,
                        'AM_MISSING_HAS_RUN' => 1,
+                        'LT_LANG' => 1,
                        'LT_SYS_DLSEARCH_PATH' => 1,
                        'LT_CONFIG_LTDL_DIR' => 1,
-                        'LT_OUTPUT' => 1,
                        'AC_LIBTOOL_DLOPEN_SELF' => 1,
+                        'LT_OUTPUT' => 1,
                        'AC_LIBTOOL_PROG_LD_SHLIBS' => 1,
-                        'AC_LIBTOOL_LINKER_OPTION' => 1,
                        'AC_WITH_LTDL' => 1,
+                        'AC_LIBTOOL_LINKER_OPTION' => 1,
                        'DX_CHI_FEATURE' => 1,
-                        'AC_LIBTOOL_CXX' => 1,
                        'LT_AC_PROG_RC' => 1,
+                        'AC_LIBTOOL_CXX' => 1,
                        'LT_INIT' => 1,
-                        'LT_SYS_DLOPEN_SELF' => 1,
-                        'LT_AC_PROG_GCJ' => 1,
                        'AX_CHECK_COMPILER_FLAGS' => 1,
+                        'LT_AC_PROG_GCJ' => 1,
+                        'LT_SYS_DLOPEN_SELF' => 1,
                        'DX_CURRENT_FEATURE' => 1,
-                        '_LT_AC_PROG_CXXCPP' => 1,
-                        'AM_DISABLE_STATIC' => 1,
                        'AM_DEP_TRACK' => 1,
-                        '_AC_PROG_LIBTOOL' => 1,
+                        'AM_DISABLE_STATIC' => 1,
+                        '_LT_AC_PROG_CXXCPP' => 1,
                        'AM_CONFIG_HEADER' => 1,
-                        '_AM_IF_OPTION' => 1,
+                        '_AC_PROG_LIBTOOL' => 1,
                        'DX_HTML_FEATURE' => 1,
+                        '_AM_IF_OPTION' => 1,
                        'AC_PATH_TOOL_PREFIX' => 1,
-                        'AC_LIBTOOL_F77' => 1,
                        'm4_pattern_allow' => 1,
+                        'AC_LIBTOOL_F77' => 1,
                        'AM_SET_LEADING_DOT' => 1,
-                        'LT_AC_PROG_EGREP' => 1,
                        '_LT_PROG_FC' => 1,
-                        '_AM_DEPENDENCIES' => 1,
+                        'LT_AC_PROG_EGREP' => 1,
                        'DX_DIRNAME_EXPR' => 1,
+                        '_AM_DEPENDENCIES' => 1,
                        'AC_LIBTOOL_LANG_C_CONFIG' => 1,
                        'LTOPTIONS_VERSION' => 1,
                        '_LT_AC_SYS_COMPILER' => 1,
-                        'AM_PROG_NM' => 1,
                        'DX_FEATURE_man' => 1,
+                        'AM_PROG_NM' => 1,
                        'DX_PS_FEATURE' => 1,
                        'AC_LIBLTDL_CONVENIENCE' => 1,
                        'AC_DEPLIBS_CHECK_METHOD' => 1,
-                        'AM_SET_CURRENT_AUTOMAKE_VERSION' => 1,
                        'AC_LIBLTDL_INSTALLABLE' => 1,
+                        'AM_SET_CURRENT_AUTOMAKE_VERSION' => 1,
                        'AC_LTDL_ENABLE_INSTALL' => 1,
-                        'AC_LIBTOOL_SYS_DYNAMIC_LINKER' => 1,
                        'LT_PROG_GCJ' => 1,
+                        'AC_LIBTOOL_SYS_DYNAMIC_LINKER' => 1,
                        'DX_FEATURE_chm' => 1,
                        'AM_INIT_AUTOMAKE' => 1,
                        'DX_FEATURE_rtf' => 1,
@@ -235,32 +235,32 @@
                        '_LT_AC_LOCK' => 1,
                        '_LT_AC_LANG_RC_CONFIG' => 1,
                        'LT_PROG_GO' => 1,
-                        'LT_SYS_MODULE_PATH' => 1,
                        'DX_ENV_APPEND' => 1,
-                        'AC_LIBTOOL_POSTDEP_PREDEP' => 1,
+                        'LT_SYS_MODULE_PATH' => 1,
                        'LT_WITH_LTDL' => 1,
-                        'AC_LTDL_SHLIBPATH' => 1,
-                        'AX_GCC_ARCHFLAG' => 1,
+                        'AC_LIBTOOL_POSTDEP_PREDEP' => 1,
                        'DX_ARG_ABLE' => 1,
+                        'AX_GCC_ARCHFLAG' => 1,
+                        'AC_LTDL_SHLIBPATH' => 1,
                        'AX_OPENMP' => 1,
                        'AM_AUX_DIR_EXPAND' => 1,
-                        '_LT_AC_LANG_F77_CONFIG' => 1,
                        'AC_LIBTOOL_PROG_COMPILER_NO_RTTI' => 1,
-                        '_AM_SET_OPTIONS' => 1,
+                        '_LT_AC_LANG_F77_CONFIG' => 1,
                        '_LT_COMPILER_OPTION' => 1,
-                        '_AM_OUTPUT_DEPENDENCY_COMMANDS' => 1,
+                        '_AM_SET_OPTIONS' => 1,
                        'AM_RUN_LOG' => 1,
-                        'AC_LIBTOOL_SYS_OLD_ARCHIVE' => 1,
-                        'AC_LTDL_SYS_DLOPEN_DEPLIBS' => 1,
+                        '_AM_OUTPUT_DEPENDENCY_COMMANDS' => 1,
                        'AC_LIBTOOL_PICMODE' => 1,
-                        'LT_PATH_LD' => 1,
-                        'AC_CHECK_LIBM' => 1,
+                        'AC_LTDL_SYS_DLOPEN_DEPLIBS' => 1,
+                        'AC_LIBTOOL_SYS_OLD_ARCHIVE' => 1,
                        'ACX_PTHREAD' => 1,
+                        'AC_CHECK_LIBM' => 1,
+                        'LT_PATH_LD' => 1,
                        'AC_LIBTOOL_SYS_LIB_STRIP' => 1,
                        '_AM_MANGLE_OPTION' => 1,
-                        'AC_LTDL_SYMBOL_USCORE' => 1,
-                        'AC_LIBTOOL_SYS_MAX_CMD_LEN' => 1,
                        'DX_FEATURE_xml' => 1,
+                        'AC_LIBTOOL_SYS_MAX_CMD_LEN' => 1,
+                        'AC_LTDL_SYMBOL_USCORE' => 1,
                        'AM_SET_DEPDIR' => 1,
                        '_LT_CC_BASENAME' => 1,
                        'DX_FEATURE_ps' => 1,
@@ -280,57 +280,57 @@
                        'configure.in'
                      ],
                      {
-                        '_LT_AC_TAGCONFIG' => 1,
                        'AM_PROG_F77_C_O' => 1,
-                        'AC_INIT' => 1,
+                        '_LT_AC_TAGCONFIG' => 1,
                        'm4_pattern_forbid' => 1,
-                        '_AM_COND_IF' => 1,
+                        'AC_INIT' => 1,
                        'AC_CANONICAL_TARGET' => 1,
-                        'AC_SUBST' => 1,
+                        '_AM_COND_IF' => 1,
                        'AC_CONFIG_LIBOBJ_DIR' => 1,
-                        'AC_FC_SRCEXT' => 1,
+                        'AC_SUBST' => 1,
                        'AC_CANONICAL_HOST' => 1,
+                        'AC_FC_SRCEXT' => 1,
                        'AC_PROG_LIBTOOL' => 1,
                        'AM_INIT_AUTOMAKE' => 1,
-                        'AM_PATH_GUILE' => 1,
                        'AC_CONFIG_SUBDIRS' => 1,
+                        'AM_PATH_GUILE' => 1,
                        'AM_AUTOMAKE_VERSION' => 1,
                        'LT_CONFIG_LTDL_DIR' => 1,
-                        'AC_REQUIRE_AUX_FILE' => 1,
                        'AC_CONFIG_LINKS' => 1,
-                        'm4_sinclude' => 1,
+                        'AC_REQUIRE_AUX_FILE' => 1,
                        'LT_SUPPORTED_TAG' => 1,
+                        'm4_sinclude' => 1,
                        'AM_MAINTAINER_MODE' => 1,
                        'AM_NLS' => 1,
                        'AC_FC_PP_DEFINE' => 1,
                        'AM_GNU_GETTEXT_INTL_SUBDIR' => 1,
-                        'AM_MAKEFILE_INCLUDE' => 1,
                        '_m4_warn' => 1,
+                        'AM_MAKEFILE_INCLUDE' => 1,
                        'AM_PROG_CXX_C_O' => 1,
-                        '_AM_COND_ENDIF' => 1,
                        '_AM_MAKEFILE_INCLUDE' => 1,
+                        '_AM_COND_ENDIF' => 1,
                        'AM_ENABLE_MULTILIB' => 1,
                        'AM_SILENT_RULES' => 1,
                        'AM_PROG_MOC' => 1,
                        'AC_CONFIG_FILES' => 1,
-                        'include' => 1,
                        'LT_INIT' => 1,
-                        'AM_PROG_AR' => 1,
+                        'include' => 1,
                        'AM_GNU_GETTEXT' => 1,
+                        'AM_PROG_AR' => 1,
                        'AC_LIBSOURCE' => 1,
-                        'AM_PROG_FC_C_O' => 1,
                        'AC_CANONICAL_BUILD' => 1,
+                        'AM_PROG_FC_C_O' => 1,
                        'AC_FC_FREEFORM' => 1,
-                        'AH_OUTPUT' => 1,
                        'AC_FC_PP_SRCEXT' => 1,
-                        '_AM_SUBST_NOTMAKE' => 1,
+                        'AH_OUTPUT' => 1,
                        'AC_CONFIG_AUX_DIR' => 1,
-                        'sinclude' => 1,
-                        'AM_PROG_CC_C_O' => 1,
+                        '_AM_SUBST_NOTMAKE' => 1,
                        'm4_pattern_allow' => 1,
-                        'AM_XGETTEXT_OPTION' => 1,
-                        'AC_CANONICAL_SYSTEM' => 1,
+                        'AM_PROG_CC_C_O' => 1,
+                        'sinclude' => 1,
                        'AM_CONDITIONAL' => 1,
+                        'AC_CANONICAL_SYSTEM' => 1,
+                        'AM_XGETTEXT_OPTION' => 1,
                        'AC_CONFIG_HEADERS' => 1,
                        'AC_DEFINE_TRACE_LITERAL' => 1,
                        'AM_POT_TOOLS' => 1,
@@ -351,57 +351,57 @@
                        'configure.in'
                      ],
                      {
-                        'AM_PROG_F77_C_O' => 1,
                        '_LT_AC_TAGCONFIG' => 1,
-                        'm4_pattern_forbid' => 1,
+                        'AM_PROG_F77_C_O' => 1,
                        'AC_INIT' => 1,
-                        '_AM_COND_IF' => 1,
+                        'm4_pattern_forbid' => 1,
                        'AC_CANONICAL_TARGET' => 1,
-                        'AC_SUBST' => 1,
+                        '_AM_COND_IF' => 1,
                        'AC_CONFIG_LIBOBJ_DIR' => 1,
-                        'AC_FC_SRCEXT' => 1,
+                        'AC_SUBST' => 1,
                        'AC_CANONICAL_HOST' => 1,
+                        'AC_FC_SRCEXT' => 1,
                        'AC_PROG_LIBTOOL' => 1,
                        'AM_INIT_AUTOMAKE' => 1,
-                        'AM_PATH_GUILE' => 1,
                        'AC_CONFIG_SUBDIRS' => 1,
+                        'AM_PATH_GUILE' => 1,
                        'AM_AUTOMAKE_VERSION' => 1,
                        'LT_CONFIG_LTDL_DIR' => 1,
-                        'AC_CONFIG_LINKS' => 1,
                        'AC_REQUIRE_AUX_FILE' => 1,
-                        'm4_sinclude' => 1,
+                        'AC_CONFIG_LINKS' => 1,
                        'LT_SUPPORTED_TAG' => 1,
+                        'm4_sinclude' => 1,
                        'AM_MAINTAINER_MODE' => 1,
                        'AM_NLS' => 1,
                        'AC_FC_PP_DEFINE' => 1,
                        'AM_GNU_GETTEXT_INTL_SUBDIR' => 1,
-                        'AM_MAKEFILE_INCLUDE' => 1,
                        '_m4_warn' => 1,
+                        'AM_MAKEFILE_INCLUDE' => 1,
                        'AM_PROG_CXX_C_O' => 1,
-                        '_AM_MAKEFILE_INCLUDE' => 1,
                        '_AM_COND_ENDIF' => 1,
+                        '_AM_MAKEFILE_INCLUDE' => 1,
                        'AM_ENABLE_MULTILIB' => 1,
                        'AM_PROG_MOC' => 1,
                        'AM_SILENT_RULES' => 1,
                        'AC_CONFIG_FILES' => 1,
-                        'LT_INIT' => 1,
                        'include' => 1,
-                        'AM_GNU_GETTEXT' => 1,
+                        'LT_INIT' => 1,
                        'AM_PROG_AR' => 1,
+                        'AM_GNU_GETTEXT' => 1,
                        'AC_LIBSOURCE' => 1,
-                        'AM_PROG_FC_C_O' => 1,
                        'AC_CANONICAL_BUILD' => 1,
+                        'AM_PROG_FC_C_O' => 1,
                        'AC_FC_FREEFORM' => 1,
-                        'AH_OUTPUT' => 1,
                        'AC_FC_PP_SRCEXT' => 1,
-                        'AC_CONFIG_AUX_DIR' => 1,
+                        'AH_OUTPUT' => 1,
                        '_AM_SUBST_NOTMAKE' => 1,
-                        'm4_pattern_allow' => 1,
-                        'sinclude' => 1,
+                        'AC_CONFIG_AUX_DIR' => 1,
                        'AM_PROG_CC_C_O' => 1,
-                        'AM_XGETTEXT_OPTION' => 1,
-                        'AC_CANONICAL_SYSTEM' => 1,
+                        'sinclude' => 1,
+                        'm4_pattern_allow' => 1,
                        'AM_CONDITIONAL' => 1,
+                        'AC_CANONICAL_SYSTEM' => 1,
+                        'AM_XGETTEXT_OPTION' => 1,
                        'AC_CONFIG_HEADERS' => 1,
                        'AC_DEFINE_TRACE_LITERAL' => 1,
                        'AM_POT_TOOLS' => 1,

--- a/examples/test_qr.c
+++ b/examples/test_qr.c
@@ -34,6 +34,17 @@
 /* Local includes. */
 #include "quicksched.h"

+void printMatrix(double* Matrix, int m, int n, int tilesize) {
+  int i, j;
+
+  for (i = 0; i < m * tilesize; i++) {
+    for (j = 0; j < n * tilesize; j++) {
+      printf(" %.3f ", Matrix[j * m * tilesize + i]);
+    }
+    printf("\n");
+  }
+}
+
 /**
 * Takes a column major matrix, NOT tile major. size is length of a side of the
 * matrix. Only works for square matrices.
@@ -85,14 +96,13 @@ double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) {
                  1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
          }
        }
-
        /* Qtemp = Qtemp * (I-tau*w*w')' */
        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
                    1.0, Qtemp, size, ww, size, 0.0, temp, size);
        double* b = Qtemp;
        Qtemp = temp;
        temp = b;
-      }
+          }
      cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
                  1.0, Q, size, Qtemp, size, 0.0, temp, size);
      double* b = Q;
@@ -120,16 +130,7 @@ double* getR(double* HR, int size) {
  return R;
 }

-void printMatrix(double* Matrix, int m, int n, int tilesize) {
-  int i, j;

-  for (i = 0; i < m * tilesize; i++) {
-    for (j = 0; j < n * tilesize; j++) {
-      printf(" %.3f ", Matrix[j * m * tilesize + i]);
-    }
-    printf("\n");
-  }
-}

 double* columnToTile(double* columnMatrix, int size, int m, int n,
                     int tilesize) {
@@ -395,6 +396,20 @@ void DTSQRF(double* restrict cornerTile, double* restrict columnTile,
  }
 }

+/* Generates a random matrix. */
+double* generateColumnMatrix(int size, unsigned long int m_z) {
+  double* matrix = malloc(sizeof(double) * size);
+  if (matrix == NULL) error("Failed to allocate matrix");
+
+  int i;
+  for (i = 0; i < size; i++) {
+    m_z = (1664525 * m_z + 1013904223) % 4294967296;
+    matrix[i] = m_z % 100;
+    if (matrix[i] < 0) matrix[i] += 100;
+  }
+  return matrix;
+}
+
 /**
 *
 * @brief Applies the householder factorisation of the corner to the row tile.
@@ -511,6 +526,8 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
    else
      A_orig[k] = matrix[k];
  }
+    if(matrix == NULL)
+        A_orig = generateColumnMatrix(m*n*K*K, 35532);
  memcpy(A, A_orig, sizeof(double) * m * n * K * K);
  bzero(tau, sizeof(double) * m * n * K);

@@ -588,7 +605,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
        qsched_addlock(&s, tid_new, rid[j * m + i]);
        qsched_adduse(&s, tid_new, rid[k * m + i]);
        qsched_adduse(&s, tid_new, rid[j * m + k]);
-        // qsched_addunlock(&s, tid[k * m + i], tid_new);
+        qsched_addunlock(&s, tid[k * m + i], tid_new);
        qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
        if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);

@@ -600,11 +617,11 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
  tot_setup = getticks() - tic;

  /* Dump the number of tasks. */
-  message("total nr of tasks: %i.", s.count);
-  message("total nr of deps: %i.", s.count_deps);
-  message("total nr of res: %i.", s.count_res);
-  message("total nr of locks: %i.", s.count_locks);
-  message("total nr of uses: %i.", s.count_uses);
+  //message("total nr of tasks: %i.", s.count);
+  //message("total nr of deps: %i.", s.count_deps);
+  //message("total nr of res: %i.", s.count_res);
+  //message("total nr of locks: %i.", s.count_locks);
+  //message("total nr of uses: %i.", s.count_uses);

  /* Loop over the number of runs. */
  for (k = 0; k < runs; k++) {
@@ -613,7 +630,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
    tic = getticks();
    qsched_run(&s, nr_threads, runner);
    toc_run = getticks();
-    message("%ith run took %lli ticks...", k, toc_run - tic);
+   // message("%ith run took %lli ticks...", k, toc_run - tic);
    tot_run += toc_run - tic;
  }

@@ -643,20 +660,21 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
      } */

  /* Dump the costs. */
-  message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup,
-          tot_run / runs);
+    double itpms = 1000.0 / CPU_TPS;
+  message("costs: setup=%.3f ms, run=%.3f ms.", ((double)tot_setup)*itpms,
+          ((double)tot_run)*itpms);

  /* Dump the timers. */
-  for (k = 0; k < qsched_timer_count; k++)
+  /*for (k = 0; k < qsched_timer_count; k++)
    message("timer %s is %lli ticks.", qsched_timer_names[k],
-            s.timers[k] / runs);
+            s.timers[k] / runs);*/

  if (matrix != NULL) {
    for (k = 0; k < m * n * K * K; k++) matrix[k] = A[k];
  }

  /* Test if the decomposition was correct.*/
-  /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+/*  double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
   double *Q = computeQ(tempMatrix, m*K, K, tau, m);
   double *R = getR(tempMatrix, m*K);
   cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
@@ -683,19 +701,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
  qsched_free(&s);
 }

-/* Generates a random matrix. */
-double* generateColumnMatrix(int size, unsigned long int m_z) {
-  double* matrix = malloc(sizeof(double) * size * size);
-  if (matrix == NULL) error("Failed to allocate matrix");

-  int i;
-  for (i = 0; i < size * size; i++) {
-    m_z = (1664525 * m_z + 1013904223) % 4294967296;
-    matrix[i] = m_z % 100;
-    if (matrix[i] < 0) matrix[i] += 100;
-  }
-  return matrix;
-}


 /* Unit tests for the tiled QR decomposition.*/
@@ -703,10 +709,10 @@ void test_SSSRFT()
 {
    double* cornerTile, *rowTile, *columnTile, *otherTile;
    double tau[1024];
-    cornerTile = generateColumnMatrix(32, 35136);
-    rowTile = generateColumnMatrix(32, 35239);
-    columnTile = generateColumnMatrix(32, 35339);
-    otherTile = generateColumnMatrix(32, 35739);
+    cornerTile = generateColumnMatrix(32*32, 35136);
+    rowTile = generateColumnMatrix(32*32, 35239);
+    columnTile = generateColumnMatrix(32*32, 35339);
+    otherTile = generateColumnMatrix(32*32, 35739);
 DGEQRF(cornerTile, 32, tau, 0, 2);
    DTSQRF(cornerTile, columnTile,
            32, 1, 0, tau,
@@ -733,8 +739,8 @@ void test_STSQRF()
 {
    double *cornerTile, *columnTile;
    double tau[1024];
-    cornerTile = generateColumnMatrix(32, 35531);
-    columnTile = generateColumnMatrix(32, 35585);
+    cornerTile = generateColumnMatrix(32*32, 35531);
+    columnTile = generateColumnMatrix(32*32, 35585);
    DGEQRF(cornerTile, 32, tau, 0, 2);
    DTSQRF(cornerTile, columnTile,
            32, 1, 0, tau,
@@ -749,8 +755,8 @@ void test_SLARFT()
 {
    double* cornerTile, *rowTile;
    double tau[1024];
-    cornerTile = generateColumnMatrix(32, 35536);
-    rowTile = generateColumnMatrix(32, 35539);
+    cornerTile = generateColumnMatrix(32*32, 35536);
+    rowTile = generateColumnMatrix(32*32, 35539);
    DGEQRF(cornerTile, 32,tau, 0, 2);
    DLARFT(cornerTile,rowTile, 32,
            1, 0, tau, 2); 
@@ -766,7 +772,7 @@ void test_SGEQRF()
 {
    double* cornerTile;
    double tau[1024];
-    cornerTile = generateColumnMatrix(32, 35532);
+    cornerTile = generateColumnMatrix(32*32, 35532);
    DGEQRF( cornerTile, 32,
            tau, 0, 2);
    printMatrix(cornerTile, 1, 1, 32);
@@ -774,10 +780,10 @@ void test_SGEQRF()
 }
 void runTests()
 {
- //   test_SGEQRF();
+   test_SGEQRF();
 //    test_SLARFT();
 //    test_STSQRF();
-    test_SSSRFT();
+//    test_SSSRFT();
 }



--- a/examples/test_qr.cu
+++ b/examples/test_qr.cu
@@ -7,13 +7,18 @@
 #include <string.h>
 #include <unistd.h>
 #include <math.h>
+#include <gperftools/profiler.h>
+
+

 /* Local includes. */
 extern "C"{
 #include "quicksched.h"
+#include <cblas.h>
 }
 #include "cuda_queue.h"

+int g_size;
 enum task_types { task_SGEQRF , task_SLARFT , task_STSQRF , task_SSSRFT} ;

 //#define TID threadIdx.x
@@ -28,6 +33,7 @@ __device__ float *GPU_tau;
 __device__ int cuda_m;
 __device__ int cuda_n;

+
 void printMatrix(float *matrix, int m, int n, int tilesize)
 {
    int i, j;
@@ -44,16 +50,110 @@ void printMatrix(float *matrix, int m, int n, int tilesize)

 }

+/**
+ * Takes a column major matrix, NOT tile major. size is length of a side of the
+ * matrix. Only works for square matrices.
+ * This function is simply for validation and is implemented naively as we know
+ * of no implementation to retrieve Q from the tiled QR.
+ */
+float* computeQ(float* HR, int size, int tilesize, float* tau, int tauNum) {
+  float* Q = (float*)malloc(sizeof(float) * size * size);
+  float* Qtemp = (float*)malloc(sizeof(float) * size * size);
+  float* w = (float*)malloc(sizeof(float) * size);
+  float* ww = (float*)malloc(sizeof(float) * size * size);
+  float* temp = (float*)malloc(sizeof(float) * size * size);
+  int i, k, l, j, n;
+  bzero(Q, sizeof(float) * size * size);
+  bzero(Qtemp, sizeof(float) * size * size);
+  bzero(ww, sizeof(float) * size * size);
+
+  for (i = 0; i < size; i++) {
+    Q[i * size + i] = 1.0;
+  }
+  int numcoltile = size / tilesize;
+  int numrowtile = size / tilesize;
+  for (k = 0; k < numrowtile; k++) {
+    for (l = 0; l < tilesize; l++) {
+      bzero(Qtemp, sizeof(float) * size * size);
+      for (i = 0; i < size; i++) {
+        Qtemp[i * size + i] = 1.0;
+      }
+
+      for (i = k; i < numcoltile; i++) {
+        bzero(w, sizeof(float) * size);
+
+        for (j = 0; j < tilesize; j++) {
+          w[i * tilesize + j] =
+              HR[(k * tilesize + l) * size + i * tilesize + j];
+        }
+        w[k * tilesize + l] = 1.0;
+        if (k * tilesize + l > i * tilesize) {
+          for (j = 0; j < k * tilesize + l; j++) w[j] = 0.0;
+        }
+
+        /* Compute (I - tau*w*w')' */
+        for (j = 0; j < size; j++) {
+          for (n = 0; n < size; n++) {
+            if (j != n)
+              ww[n * size + j] =
+                  -tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
+            else
+              ww[n * size + j] =
+                  1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
+          }
+        }
+        /* Qtemp = Qtemp * (I-tau*w*w')' */
+        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
+                    1.0, Qtemp, size, ww, size, 0.0, temp, size);
+        float* b = Qtemp;
+        Qtemp = temp;
+        temp = b;
+            
+      }
+      cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
+                  1.0, Q, size, Qtemp, size, 0.0, temp, size);
+      float* b = Q;
+      Q = temp;
+      temp = b;
+    }
+  }
+
+  free(Qtemp);
+  free(w);
+  free(ww);
+  free(temp);
+  return Q;
+}
+
+float* getR(float* HR, int size) {
+  float* R = (float*)malloc(sizeof(float) * size * size);
+  int i, j;
+  bzero(R, sizeof(float) * size * size);
+  for (i = 0; i < size; i++) {
+    for (j = 0; j <= i; j++) {
+      R[i * size + j] = HR[i * size + j];
+    }
+  }
+  return R;
+}
+
+
+
 /* Generates a random matrix. */
 float* generateColumnMatrix(int size, unsigned long int m_z)
 {
    float* matrix;
-    cudaMallocHost(&matrix, sizeof(float)*size*size);
+    
+    cudaError_t code = cudaMallocHost(&matrix, sizeof(float)*size/**size*/);
+    if(code != cudaSuccess)
+        printf("%s size = %i g_size = %i\n", cudaGetErrorString(code),size, g_size);
+       else
+        g_size = g_size + sizeof(float)*size*size;
    if(matrix == NULL)
        error("Failed to allocate matrix");

    int i;
-    for(i = 0 ; i < size*size; i++)
+    for(i = 0 ; i < size; i++)
    {
        m_z = (1664525*m_z + 1013904223) % 4294967296;
        matrix[i] = m_z % 100;
@@ -454,43 +554,39 @@ __device__ void runner ( int type , void *data ) {
 	workVector = blockCache + (32*32);
        /* Decode the task data. */
        int *idata = (int *)data;
-        int i = idata[0], j = idata[1], k = idata[2];
+        int i = idata[0];
+        int j = idata[1];
+        int k = idata[2];
    //    int z;
 //        double buff[ 2*K*K ];
        /* Decode and execute the task. */
        switch ( type ) {
            case task_SGEQRF:
+                //if(threadIdx.x == 0)
+                  //  printf("SGEQRF %i %i\n", k, (k*cuda_m+k)*32*32);
                if(threadIdx.x < 32)
-//                    runner_cuda_SGEQRF(i, j, k, workVector, blockCache);
-
-                if(threadIdx.x == 0){
-                    printf("SGEQRF: %i %i %i + %i\n",i, j , k, clock());
-                printf("tau = ");
-                for(k = 0; k < cuda_m * cuda_n * 32 ; k++)
-                {
-                    printf("%.3f ", GPU_tau[k]);
-                }
-                printf("\n");}
+                SGEQRF( &GPU_matrix[(k*cuda_m+k)*32*32], 32, GPU_tau, k, cuda_m);
                break;
            case task_SLARFT:
-                if(threadIdx.x == 0)
-                    printf("SLARFT: %i %i %i + %i\n",i,j,k, clock());
-                //runner_cuda_SLARFT(i, j, k, workVector, blockCache);
+        //        if(threadIdx.x == 0)
+        //            printf("SLARFT %i %i %i\n", k,j, (j*cuda_m+k)*32*32);
+                SLARFT( &GPU_matrix[(k*cuda_m +k)*32*32], &GPU_matrix[(j*cuda_m+k)*32*32], 32, j, k, GPU_tau, cuda_m);
                break;
            case task_STSQRF:
-                if(threadIdx.x == 0)
-                    printf("STSQRF: %i %i %i + %i\n",i,j,k, clock());
-                //runner_cuda_STSQRF(i, j, k, workVector, blockCache);
+          //      if(threadIdx.x == 0)
+         //           printf("STSQRF %i %i %i\n", k,i, (k*cuda_m+i)*32*32);
+                if(threadIdx.x < 32)
+                    STSQRF( &GPU_matrix[(k*cuda_m+k)*32*32], &GPU_matrix[(k*cuda_m + i)*32*32], 32, i, k, GPU_tau, cuda_m);
                break;
            case task_SSSRFT:
-                if(threadIdx.x == 0)
-                    printf("SSSRFT: %i %i %i + %i\n",i,j,k, clock());
-                //runner_cuda_SSSRFT(i, j, k, workVector, blockCache);
+         //       if(threadIdx.x == 0)
+         //           printf("SSSRFT %i %i %i %i\n", k,j, i, (j*cuda_m+i)*32*32);
+                SSSRFT( &GPU_matrix[(j*cuda_m+i)*32*32],&GPU_matrix[(k*cuda_m + i)*32*32],&GPU_matrix[(j*cuda_m+k)*32*32], 32, i, j,        k, GPU_tau, cuda_m );
                break;
            default:
                asm("trap;");
            }
-
+__syncthreads();
 }

 __device__ qsched_funtype function = runner;
@@ -505,6 +601,7 @@ float* generateMatrix( int m, int n)
 {
    float* Matrix;
    cudaMallocHost(&Matrix, sizeof(float) * m*n*32*32);
+    g_size = g_size + sizeof(float)*m*n*32*32;
    if(Matrix == NULL)
        error("Failed to allocate Matrix");
    int i, j;
@@ -557,40 +654,33 @@ float* columnToTile( float* columnMatrix, int size , int m , int n )

 }

-float* tileToColumn( float* tileMatrix, int size, int m , int n )
-{
-    float* ColumnMatrix;
-    ColumnMatrix = (float*) malloc(sizeof(float) * size );
-    if(ColumnMatrix == NULL)
-        error("failed to allocate ColumnMatrix");
-    int rows = m*32;
-    int columns = n*32;
-    int i,j,k,l;
-    for( i = 0; i < n ; i++ )
-    {
-        for(j = 0; j < m ; j++ )
-        {
-            /* Tile on ith column is at i*m*32*32.*/
-            /* Tile on jth is at j*32*32 */
-            float *tile = &tileMatrix[i*m*32*32 + j*32*32];
-            /* Column starts at same position as tile. */
-            /* Row j*32.*/
-            float *tilePos = &ColumnMatrix[i*m*32*32 + j*32];
-            for( k = 0; k < 32; k++ )
-            {
-                for(l=0; l < 32; l++)
-                {
-                    tilePos[l] = tile[l];
-                }
-                /* Next 32 elements are the position of the tile in the next column.*/
-                tile = &tile[32];
-                /* Move to the j*32th position in the next column. */                
-                tilePos = &tilePos[32*m];
-
-            }
-        }   
+float* tileToColumn(float* tileMatrix, int size, int m, int n, int tilesize) {
+  float* ColumnMatrix;
+  ColumnMatrix = (float*)malloc(sizeof(float) * size);
+  if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix");
+  int i, j, k, l;
+  for (i = 0; i < n; i++) {
+    for (j = 0; j < m; j++) {
+      /* Tile on ith column is at i*m*32*32.*/
+      /* Tile on jth is at j*32*32 */
+      float* tile =
+          &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+      /* Column starts at same position as tile. */
+      /* Row j*32.*/
+      float* tilePos =
+          &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize];
+      for (k = 0; k < tilesize; k++) {
+        for (l = 0; l < tilesize; l++) {
+          tilePos[l] = tile[l];
+        }
+        /* Next 32 elements are the position of the tile in the next column.*/
+        tile = &tile[tilesize];
+        /* Move to the j*32th position in the next column. */
+        tilePos = &tilePos[tilesize * m];
+      }
    }
-    return ColumnMatrix;
+  }
+  return ColumnMatrix;
 }


@@ -672,24 +762,29 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
 /* Initialize the scheduler. */
    qsched_init( &s , 1 , qsched_flag_none );
    cudaDeviceReset();
+    if( cudaPeekAtLastError() != cudaSuccess)
+        error("Setup Failed: %s", cudaGetErrorString(cudaPeekAtLastError()));    
    cudaSetDevice(0);
    Setup<<<1,1>>>();
-    if(cudaDeviceSynchronize() != cudaSuccess)
+    g_size = 0;
+  if(cudaDeviceSynchronize() != cudaSuccess)
    error("Setup Failed: %s", cudaGetErrorString(cudaPeekAtLastError()));
    
    /* Allocate and fill the original matrix. */
    if(cudaMallocHost(&A, sizeof(float) * m * n * K * K  ) != cudaSuccess ||
-        cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess ||
-        cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K  ) != cudaSuccess )
+        cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess /*||
+        cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K  ) != cudaSuccess*/ )
        error("Failed to allocate matrices.");
-    cudaFreeHost(A_orig);
+    g_size = g_size + sizeof(float) * m * n * K * K  + sizeof(float) * m * n * K;
+    //cudaFreeHost(A_orig);
 //    for ( k = 0 ; k < m * n * K * K ; k++ )
  //      A_orig[k] = 2.0f*((float)rand()) / RAND_MAX - 1.0f;
-    A_orig = generateMatrix(m, n);
+    A_orig = generateColumnMatrix(m*n*K*K, 35532);
+  //  printMatrix(A_orig, m, n);
    float *temp = columnToTile(A_orig, m * n * K *K,  m , n);
    cudaFreeHost(A_orig);
    A_orig = temp;
-    temp = tileToColumn(A_orig, m*n*K*K, m , n);
+    temp = tileToColumn(A_orig, m*n*K*K, m , n, K);
 //    printMatrix(temp, m, n);
    
 //    printTileMatrix(A_orig, m, n);
@@ -721,6 +816,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
        tid[k] = qsched_task_none;
        if( cudaHostGetDevicePointer(&temp_device_array , &A[k*32*32] , 0) != cudaSuccess)
            error("Failed to get device pointer to matrix");
+
        rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none , &A[k*32*32], sizeof(float) * 32 * 32, device_array + k * 32 *32);
        }

@@ -730,22 +826,20 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
        /* Add kth corner task. */
        data[0] = k; data[1] = k; data[2] = k;
        tid_new = qsched_addtask( &s , task_SGEQRF , task_flag_none , data , sizeof(int)*3 , 2 );
-        qsched_addlock( &s , tid_new , rid[ k*m + k ] );
-        if ( tid[ k*m + k ] != -1 )
-            qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
-        tid[ k*m + k ] = tid_new;
+        qsched_addlock(&s, tid_new, rid[k * m + k]);
+        if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new);
+        tid[k * m + k] = tid_new;


        /* Add column tasks on kth row. */
        for ( j = k+1 ; j < n ; j++ ) {
            data[0] = k; data[1] = j; data[2] = k;
            tid_new = qsched_addtask( &s , task_SLARFT , task_flag_none , data , sizeof(int)*3 , 3 );
-            qsched_addlock( &s , tid_new , rid[ j*m + k ] );
-            qsched_adduse( &s , tid_new , rid[ k*m + k ] );
-            qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
-            if ( tid[ j*m + k ] != -1 )
-                qsched_addunlock( &s , tid[ j*m + k ] , tid_new );
-            tid[ j*m + k ] = tid_new;
+            qsched_addlock(&s, tid_new, rid[j * m + k]);
+            qsched_adduse(&s, tid_new, rid[k * m + k]);
+            qsched_addunlock(&s, tid[k * m + k], tid_new);
+            if (tid[j * m + k] != -1) qsched_addunlock(&s, tid[j * m + k], tid_new);
+            tid[j * m + k] = tid_new;
            }
            
        /* For each following row... */
@@ -754,25 +848,23 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
            /* Add the row tasks for the kth column. */
            data[0] = i; data[1] = k; data[2] = k;
            tid_new = qsched_addtask( &s , task_STSQRF , task_flag_none , data , sizeof(int)*3 , 3 );
-            qsched_addlock( &s , tid_new , rid[ k*m + i ] );
-            qsched_adduse( &s , tid_new , rid[ k*m + k ] );
-            qsched_addunlock( &s , tid[ k*m + (i-1) ] , tid_new );
-            if ( tid[ k*m + i ] != -1 )
-                qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
-            tid[ k*m + i ] = tid_new;
-            
+            qsched_addlock(&s, tid_new, rid[k * m + i]);
+           qsched_adduse(&s, tid_new, rid[k * m + k]);
+            qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
+            if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
+            tid[k * m + i] = tid_new;
+              
            /* Add the inner tasks. */
            for ( j = k+1 ; j < n ; j++ ) {
                data[0] = i; data[1] = j; data[2] = k;
                tid_new = qsched_addtask( &s , task_SSSRFT , task_flag_none , data , sizeof(int)*3 , 5 );
-                qsched_addlock( &s , tid_new , rid[ j*m + i ] );
-                qsched_adduse( &s , tid_new , rid[ k*m + i ] );
-                qsched_adduse( &s , tid_new , rid[ j*m + k ] );
-                qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
-                qsched_addunlock( &s , tid[ j*m + k ] , tid_new );
-                if ( tid[ j*m + i ] != -1 )
-                    qsched_addunlock( &s , tid[ j*m + i ] , tid_new );
-                tid[ j*m + i ] = tid_new;
+                qsched_addlock(&s, tid_new, rid[j * m + i]);
+                qsched_adduse(&s, tid_new, rid[k * m + i]);
+                qsched_adduse(&s, tid_new, rid[j * m + k]);
+                qsched_addunlock(&s, tid[k * m + i], tid_new);
+                qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
+                if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
+                tid[j * m + i] = tid_new;
                }
        
            }
@@ -787,11 +879,42 @@ void test_qr(int m , int n , int K , int nr_threads , int runs)
    if( cudaMemcpyToSymbol( cuda_n, &n, sizeof(int), 0, cudaMemcpyHostToDevice ) != cudaSuccess )
        error("Failed to copy n to the device");    

+
        qsched_run_CUDA( &s , func );
-    cudaMemcpy( A , device_array , sizeof(float) * m * n * K * K, cudaMemcpyHostToDevice);
-    A = tileToColumn(A,m * n * K * K, m, n);
-    printMatrix(A, m, n);
+
+    if(cudaMemcpy(  tau , tau_device , sizeof(float) * m * n * K , cudaMemcpyDeviceToHost ) != cudaSuccess )
+        error("Failed to copy the tau data from the device.");
+
+   // printMatrix(tileToColumn(A, m*n*K*K, m, n, K), m, n);
+   /*float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+   float *Q = computeQ(tempMatrix, m*K, K, tau, m);
+   float *R = getR(tempMatrix, m*K);
+   cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
+   m*K, R, m*K, 0.0, tempMatrix, m*K);
+       free(Q);
+   // printMatrix(tempMatrix, m, n);
+   // printf("\n\n\n\n");
+       Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
+   for(i = 0; i < m * n * K * K; i++)
+   {
+       if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] <
+   0.995))
+           printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i],
+   tempMatrix[i]);
+   }
+   free(tempMatrix);
+   free(Q);
+   free(R);*/
+
+//    cudaMemcpy( A , device_array , sizeof(float) * m * n * K * K, cudaMemcpyHostToDevice);
+ //   A = tileToColumn(A,m * n * K * K, m, n, K);
+   // printMatrix(A, m, n);
 //    printTileMatrix(A, m , n);
+    struct task* tasks =  qsched_get_timers( &s , s.count );
+    /*for(i = 0; i < s.count; i++)
+    {
+        printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID);
+    }*/

 }

@@ -812,7 +935,7 @@ void test()
    printf("\n \n \n");
    //printTileMatrix(MatrixTile, m, n);
    free(Matrix);
-    Matrix = tileToColumn(MatrixTile, m*n*32*32, m , n);
+    Matrix = tileToColumn(MatrixTile, m*n*32*32, m , n, 32);
    printMatrix(Matrix, m, n);
    free(MatrixTile);
    free(Matrix);
@@ -832,11 +955,11 @@ __global__ void SGEQRF_test(float* cornerTile)
    
    for(i = threadIdx.x; i < 32*32; i+=blockDim.x)
    {
-        printf("Copying value %i\n", i);
+       // printf("Copying value %i\n", i);
        tile[i] = cornerTile[i];
    }

-    printf("blockDim.x = %i\n", blockDim.x);
+   // printf("blockDim.x = %i\n", blockDim.x);
    __syncthreads();
    if(threadIdx.x < 32)
        SGEQRF(tile, 32, tau, 0, 1);
@@ -849,7 +972,7 @@ __global__ void SGEQRF_test(float* cornerTile)
 void test_SGEQRF()
 {
    float* cornerTile;
-    cornerTile = generateColumnMatrix(32, 35532);
+    cornerTile = generateColumnMatrix(32*32, 35532);
    SGEQRF_test<<<1, 128>>>(cornerTile);
    cudaDeviceSynchronize();
    printMatrix(cornerTile, 1, 1, 32);
@@ -886,8 +1009,8 @@ __global__ void SLARFT_test(float* cornerTile, float* rowTile)
 void test_SLARFT()
 {
    float* cornerTile, *rowTile;
-    cornerTile = generateColumnMatrix(32, 35536);
-    rowTile = generateColumnMatrix(32, 35539);
+    cornerTile = generateColumnMatrix(32*32, 35536);
+    rowTile = generateColumnMatrix(32*32, 35539);
    SLARFT_test<<<1, 128>>>(cornerTile, rowTile);
    cudaDeviceSynchronize();
    printMatrix(cornerTile, 1, 1, 32);
@@ -923,8 +1046,8 @@ __global__ void STSQRF_test(float* cornerTile, float* columnTile)
 void test_STSQRF()
 {
    float *cornerTile, *columnTile;
-    cornerTile = generateColumnMatrix(32, 35531);
-    columnTile = generateColumnMatrix(32, 35585);
+    cornerTile = generateColumnMatrix(32*32, 35531);
+    columnTile = generateColumnMatrix(32*32, 35585);
    STSQRF_test<<<1,32>>>(cornerTile, columnTile);
    cudaDeviceSynchronize();
    printMatrix(cornerTile, 1, 1, 32);
@@ -971,10 +1094,10 @@ __global__ void SSSRFT_test(float* cornerTile, float* columnTile, float* rowTile
 void test_SSSRFT()
 {
    float* cornerTile, *rowTile, *columnTile, *otherTile;
-    cornerTile = generateColumnMatrix(32, 35136);
-    rowTile = generateColumnMatrix(32, 35239);
-    columnTile = generateColumnMatrix(32, 35339);
-    otherTile = generateColumnMatrix(32, 35739);
+    cornerTile = generateColumnMatrix(32*32, 35136);
+    rowTile = generateColumnMatrix(32*32, 35239);
+    columnTile = generateColumnMatrix(32*32, 35339);
+    otherTile = generateColumnMatrix(32*32, 35739);
    SSSRFT_test<<<1, 128>>>(cornerTile, columnTile, rowTile, otherTile);
    cudaDeviceSynchronize();
    printMatrix(cornerTile, 1, 1, 32);
@@ -995,10 +1118,10 @@ void test_SSSRFT()

 void runTests()
 {
-   // test_SGEQRF();
+    test_SGEQRF();
  //  test_SLARFT();
 //   test_STSQRF();
-    test_SSSRFT();
+ //   test_SSSRFT();
 }
 /**
 * @brief Main function.
@@ -1056,5 +1179,6 @@ int main ( int argc , char *argv[] ) {
        
    test_qr( M , N , K , nr_threads , runs );
    
+    printf("\n");
    }

--- a/src/cuda_queue.cu
+++ b/src/cuda_queue.cu
--- a/src/cuda_queue.h
+++ b/src/cuda_queue.h
@@ -16,7 +16,7 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 ******************************************************************************/
-#define cuda_numqueues 2
+#define cuda_numqueues 3

 /** Type definition for the execution function in #qsched_run. */
 typedef void (*qsched_funtype)( int , void * );
@@ -38,6 +38,8 @@ struct queue_cuda {
    
    /* The recycling list. */
    volatile int *rec_data;
+    
+    volatile int nr_avail_tasks;

    };


--- a/src/qsched.c
+++ b/src/qsched.c
@@ -95,7 +95,7 @@ void qsched_res_own ( struct qsched *s , qsched_res_t res , int owner ) {
 void qsched_ensure ( struct qsched *s , int nr_tasks , int nr_res , int nr_deps , int nr_locks , int nr_uses , int size_data ) {

    int dirty = 0;
-    
+    printf("Called qsched_ensure.\n");
    /* Re-allocate tasks? */
    if ( s->size < nr_tasks ) {
        dirty = 1;
@@ -787,71 +787,6 @@ struct task* qsched_gettask ( struct qsched *s , int qid ) {

    }

-/**
- * @brief Sort the data according to the given indices.
- *
- * @param data The data to be sorted
- * @param ind The indices with respect to which the data are sorted.
- * @param N The number of entries
- * @param min Lowest index.
- * @param max highest index.
- *
- * This function calls qsched_quicksort.
- */
- 
-void qsched_sort ( int *restrict data, int *restrict ind, int N, int min, int max ) {
-    int *new_data;
-    int *new_ind;
-    int i;
-    if(N <= 0)
-        return;
-    new_data = (int*)malloc(sizeof(int) * N);
-    if(new_data == NULL)
-        error("Failed to allocate new_data");
-    new_ind = (int*)malloc(sizeof(int) * N);
-    if(new_ind == NULL)
-        error("Failed to allocate new_ind");
-
-    /*Create buckets of size ? - Ideally <16 elements per bucket. Use max-min / N * 10 ? Should give average of 10 elements per bucket */
-    int bucketsize = 1;
-
-    /* To find bucket do ind-min / b and it goes in that bucket.*/
-    int num_buckets = (max-min) / bucketsize +1;
-    int *bucket_inds = (int*) malloc(sizeof(int) * num_bucket);
-    if(bucket_inds == NULL)
-        error("Failed to allocate bucket_inds");
-    memset(bucket_inds,0, sizeof(int)*num_buckets);
-    for(i = 0; i < N; i++)
-    {
-        bucket_inds[(ind[i]-min)]++;
-    }
-    for(i = 1; i < num_buckets; i++ )
-    {
-        bucket_inds[i] = bucket_inds[i] + bucket_inds[i-1];
-    }
-    /* bucket_inds[i] contains the starting position for the i+1th bucket*/
-    for(i = num_buckets-1; i >0; i--)
-    {
-        bucket_inds[i] = bucket_inds[i-1];
-    }
-    bucket_inds[0] = 0;
-
-    for(i = 0; i < N; i++)
-    {
-        int z = (ind[i]-min);
-        new_data[bucket_inds[z]] = data[i];
-        new_ind[bucket_inds[z]++] = ind[i];
-    }
-
-    /* Copy data back to data and ind and deallocate everything!*/
-    memcpy(data, new_data, N*sizeof(int));
-    memcpy(ind, new_ind, N*sizeof(int));
-    free(new_data);
-    free(new_ind);
-    free(bucket_inds);
-
-}
-
 /**
 * @brief Sort the data according to the given indices.
 *
@@ -872,7 +807,6 @@ void qsched_quicksort ( int *restrict data , int *restrict ind , int N , int min
    
    /* If N is small enough, just do insert sort. */
    if ( N < 16 ) {
-        printf("%i\n", N);
        for ( i = 1 ; i < N ; i++ )
            if ( ind[i] < ind[i-1] ) {
                temp_i = ind[i];
@@ -934,6 +868,167 @@ void qsched_quicksort ( int *restrict data , int *restrict ind , int N , int min
        
    }

+int log_base2(unsigned int v)
+{
+const unsigned int b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000};
+const unsigned int S[] = {1, 2, 4, 8, 16};
+int i;
+
+register unsigned int r = 0; // result of log2(v) will go here
+for (i = 4; i >= 0; i--) // unroll for speed...
+{
+  if (v & b[i])
+  {
+    v >>= S[i];
+    r |= S[i];
+  } 
+}
+return r;
+}
+/**
+ * @brief Sort the data according to the given indices.
+ *
+ * @param data The data to be sorted
+ * @param ind The indices with respect to which the data are sorted.
+ * @param N The number of entries
+ * @param min Lowest index.
+ * @param max highest index.
+ *
+ * This function calls qsched_quicksort.
+ */
+ 
+void qsched_sort ( int *restrict data, int *restrict ind, int N, int min, int max ) {
+    int *new_data;
+    int *new_ind;
+    int i, j, temp_i, temp_d;
+  //  printf("N=%i, max =%i, min = %i\n", N, max, min);
+    if(N <= 0)
+        return;
+ //   if((max-min) > 10*N)
+  //  {
+   //     qsched_quicksort(data, ind, N, min, max);
+   //     return;
+   // }
+//    printf("sort size: %i, max-min = %i\n",N, max-min);
+    /* If N is small enough, just do insert sort. */
+    if ( N < 11 ) {
+//        printf("%i\n", N);
+        for ( i = 1 ; i < N ; i++ )
+            if ( ind[i] < ind[i-1] ) {
+                temp_i = ind[i];
+                temp_d = data[i];
+                for ( j = i ; j > 0 && ind[j-1] > temp_i ; j-- ) {
+                    ind[j] = ind[j-1];
+                    data[j] = data[j-1];
+                    }
+                ind[j] = temp_i;
+                data[j] = temp_d;
+                }
+        return;
+        }
+
+    if((max-min) > 8*N)
+    {
+        int log = log_base2(max-min)-2;
+        log = log_base2((max-min)/8);
+    //    int bucketsize = 1 << log;
+        /* Each bucket contains (max-min)/10 width.*/
+        new_data = (int*)malloc(sizeof(int)*N);
+        new_ind = (int*)malloc(sizeof(int)*N);
+        int num_buckets = ((max-min) >> log) +1;
+        int *bucket_inds = (int*)malloc(sizeof(int)*num_buckets);
+        memset(bucket_inds, 0, sizeof(int)*num_buckets);
+        for(i = 0; i < N; i++)
+        {
+           // printf("%i\n", (ind[i]-min) >> log);
+            if(((ind[i]-min) >> log ) > num_buckets)
+                error("Exceeded num_buckets, ind[i]-min = %i, log = %i, ind[i]-min>>log=%i, ind[i] = %i, max = %i, num_buckets = %i, N=%i", ind[i]-min, log, (ind[i]-min)>>log, ind[i], max, num_buckets, N);
+            bucket_inds[(ind[i]-min) >> log]++;
+        }
+        for(i = 1; i < num_buckets; i++)
+        {
+            bucket_inds[i] = bucket_inds[i] + bucket_inds[i-1];
+        }
+        for(i = num_buckets-1; i > 0; i--)
+        {
+            bucket_inds[i] = bucket_inds[i-1];
+        }
+        bucket_inds[0] = 0;
+        for(i = 0; i < N; i++)
+        {
+            int z = (ind[i]-min) >> log;
+            if(ind[i] > max || ind[i] < min)
+                error("ind[i]=%i, max = %i, min = %i\n", ind[i], max, min);
+
+            new_data[bucket_inds[z]] = data[i];
+            new_ind[bucket_inds[z]++] = ind[i];
+        }
+        memcpy(data, new_data, N*sizeof(int));
+        memcpy(ind, new_ind, N*sizeof(int));
+        free(new_data);
+        free(new_ind);
+        for(i = num_buckets-1; i > 0; i--)
+        {
+            bucket_inds[i] = bucket_inds[i-1];
+        }
+        bucket_inds[0] = 0;
+        /*Recusively sort each bucket. */
+        for(i = 0; i < num_buckets-1; i++)
+        {
+//            printf("bucket_inds[i] = %i, N = %i, i = %i, num_buckets = %i\n", bucket_inds[i], N, i, num_buckets);
+            qsched_sort(&data[bucket_inds[i]], &ind[bucket_inds[i]], bucket_inds[i+1] - bucket_inds[i], min + (1<<log)*i, min + (1<<log)*(i+1) );
+        }
+        return;
+    }
+    new_data = (int*)malloc(sizeof(int) * N);
+    if(new_data == NULL)
+        error("Failed to allocate new_data");
+    new_ind = (int*)malloc(sizeof(int) * N);
+    if(new_ind == NULL)
+        error("Failed to allocate new_ind");
+
+
+    int bucketsize = 1;
+
+    /* To find bucket do ind-min / b and it goes in that bucket.*/
+    int num_buckets = (max-min) / bucketsize +1;
+    int *bucket_inds = (int*) malloc(sizeof(int) * num_buckets);
+    if(bucket_inds == NULL)
+        error("Failed to allocate bucket_inds, max = %i, min = %i", max, min);
+    memset(bucket_inds,0, sizeof(int)*num_buckets);
+    for(i = 0; i < N; i++)
+    {
+        bucket_inds[(ind[i]-min)]++;
+    }
+    for(i = 1; i < num_buckets; i++ )
+    {
+        bucket_inds[i] = bucket_inds[i] + bucket_inds[i-1];
+    }
+    /* bucket_inds[i] contains the starting position for the i+1th bucket*/
+    for(i = num_buckets-1; i >0; i--)
+    {
+        bucket_inds[i] = bucket_inds[i-1];
+    }
+    bucket_inds[0] = 0;
+
+    for(i = 0; i < N; i++)
+    {
+        int z = (ind[i]-min);
+        new_data[bucket_inds[z]] = data[i];
+        new_ind[bucket_inds[z]++] = ind[i];
+    }
+
+    /* Copy data back to data and ind and deallocate everything!*/
+    memcpy(data, new_data, N*sizeof(int));
+    memcpy(ind, new_ind, N*sizeof(int));
+    free(new_data);
+    free(new_ind);
+    free(bucket_inds);
+
+}
+
+
+

 /**
 * @brief Prepare a #qsched for execution.
@@ -1275,7 +1370,6 @@ void qsched_addunlock ( struct qsched *s , int ta , int tb ) {
    
    /* Do the deps need to be re-allocated? */
    if ( s->count_deps == s->size_deps ) {
-    
        /* Scale the deps list size. */
        s->size_deps *= qsched_stretch;
        
@@ -1337,7 +1431,6 @@ int qsched_addtask ( struct qsched *s , int type , unsigned int flags , void *da
    
    /* Do the tasks need to be re-allocated? */
    if ( s->count == s->size ) {
-    
        /* Scale the task list size. */
        s->size *= qsched_stretch;
        

--- a/src/task.h
+++ b/src/task.h
@@ -54,6 +54,10 @@ struct task {
    int qid;
    
    /* Task weight for queue selection. */
-    int cost, weight; 
+    int cost, weight;
+
+    #ifdef WITH_CUDA
+    int blockID;
+    #endif 

    };