diff --git a/autom4te.cache/requests b/autom4te.cache/requests index 7675eba62587b513c5752f30ea1140a2981e22c3..6cd5e4a3a44e4986e168bef0c16985538948cd38 100644 --- a/autom4te.cache/requests +++ b/autom4te.cache/requests @@ -58,9 +58,9 @@ '_LT_AC_LANG_CXX_CONFIG' => 1, 'AM_PROG_MKDIR_P' => 1, 'DX_IF_FEATURE' => 1, - 'DX_FEATURE_doc' => 1, 'AM_AUTOMAKE_VERSION' => 1, 'DX_CHM_FEATURE' => 1, + 'DX_FEATURE_doc' => 1, 'DX_PDF_FEATURE' => 1, 'AM_SUBST_NOTMAKE' => 1, 'AM_MISSING_PROG' => 1, @@ -68,11 +68,11 @@ '_LT_AC_LANG_C_CONFIG' => 1, 'AM_PROG_INSTALL_STRIP' => 1, '_m4_warn' => 1, - 'AX_GCC_X86_CPUID' => 1, 'AC_LIBTOOL_OBJDIR' => 1, + 'AX_GCC_X86_CPUID' => 1, 'gl_FUNC_ARGZ' => 1, - 'LTOBSOLETE_VERSION' => 1, 'AM_SANITY_CHECK' => 1, + 'LTOBSOLETE_VERSION' => 1, 'AC_LIBTOOL_LANG_GCJ_CONFIG' => 1, 'AC_LIBTOOL_PROG_COMPILER_PIC' => 1, 'LT_LIB_M' => 1, @@ -85,21 +85,21 @@ 'AC_LIBTOOL_GCJ' => 1, 'DX_CLEAR_DEPEND' => 1, '_LT_WITH_SYSROOT' => 1, - 'LT_SYS_DLOPEN_DEPLIBS' => 1, 'LT_FUNC_DLSYM_USCORE' => 1, - 'AC_LIBTOOL_CONFIG' => 1, + 'LT_SYS_DLOPEN_DEPLIBS' => 1, '_LT_AC_LANG_F77' => 1, - 'AC_LTDL_DLLIB' => 1, + 'AC_LIBTOOL_CONFIG' => 1, '_AM_SUBST_NOTMAKE' => 1, + 'AC_LTDL_DLLIB' => 1, '_AM_AUTOCONF_VERSION' => 1, 'AM_DISABLE_SHARED' => 1, '_LT_PROG_ECHO_BACKSLASH' => 1, '_LTDL_SETUP' => 1, - 'AM_PROG_LIBTOOL' => 1, '_LT_AC_LANG_CXX' => 1, - 'AM_PROG_LD' => 1, - '_LT_AC_FILE_LTDLL_C' => 1, + 'AM_PROG_LIBTOOL' => 1, 'AC_LIB_LTDL' => 1, + '_LT_AC_FILE_LTDLL_C' => 1, + 'AM_PROG_LD' => 1, 'DX_DOXYGEN_FEATURE' => 1, 'AU_DEFUN' => 1, 'AC_PROG_NM' => 1, @@ -113,58 +113,58 @@ '_AM_SET_OPTION' => 1, 'AC_LTDL_PREOPEN' => 1, '_LT_LINKER_BOILERPLATE' => 1, - '_LT_PREPARE_SED_QUOTE_VARS' => 1, - 'AC_LIBTOOL_PROG_CC_C_O' => 1, 'AC_LIBTOOL_LANG_CXX_CONFIG' => 1, + 'AC_LIBTOOL_PROG_CC_C_O' => 1, + '_LT_PREPARE_SED_QUOTE_VARS' => 1, 'gl_PREREQ_ARGZ' => 1, 'AX_EXT' => 1, - 'LT_SUPPORTED_TAG' => 1, 'AM_OUTPUT_DEPENDENCY_COMMANDS' => 1, - 'LT_PROG_RC' => 1, + 'LT_SUPPORTED_TAG' => 1, 
'LT_SYS_MODULE_EXT' => 1, + 'LT_PROG_RC' => 1, 'AC_DEFUN_ONCE' => 1, 'AX_CHECK_COMPILE_FLAG' => 1, 'DX_XML_FEATURE' => 1, '_LT_AC_LANG_GCJ' => 1, 'AC_LTDL_OBJDIR' => 1, - 'DX_TEST_FEATURE' => 1, '_LT_PATH_TOOL_PREFIX' => 1, + 'DX_TEST_FEATURE' => 1, 'AC_LIBTOOL_RC' => 1, - '_LT_AC_PROG_ECHO_BACKSLASH' => 1, - 'AC_DISABLE_FAST_INSTALL' => 1, 'AM_SILENT_RULES' => 1, + 'AC_DISABLE_FAST_INSTALL' => 1, + '_LT_AC_PROG_ECHO_BACKSLASH' => 1, 'DX_CHECK_DEPEND' => 1, 'DX_FEATURE_pdf' => 1, - 'include' => 1, - 'DX_REQUIRE_PROG' => 1, - '_LT_AC_TRY_DLOPEN_SELF' => 1, '_LT_AC_SYS_LIBPATH_AIX' => 1, + '_LT_AC_TRY_DLOPEN_SELF' => 1, + 'DX_REQUIRE_PROG' => 1, + 'include' => 1, 'LT_AC_PROG_SED' => 1, 'AM_ENABLE_SHARED' => 1, 'DX_FEATURE_html' => 1, 'LTDL_INSTALLABLE' => 1, - 'DX_CURRENT_DESCRIPTION' => 1, '_LT_AC_LANG_GCJ_CONFIG' => 1, + 'DX_CURRENT_DESCRIPTION' => 1, 'AC_ENABLE_SHARED' => 1, - '_LT_REQUIRED_DARWIN_CHECKS' => 1, - 'AC_LIBTOOL_SYS_HARD_LINK_LOCKS' => 1, 'AC_ENABLE_STATIC' => 1, - 'AX_FUNC_POSIX_MEMALIGN' => 1, - 'AM_PROG_CC_C_O' => 1, + 'AC_LIBTOOL_SYS_HARD_LINK_LOCKS' => 1, + '_LT_REQUIRED_DARWIN_CHECKS' => 1, '_LT_AC_TAGVAR' => 1, + 'AM_PROG_CC_C_O' => 1, + 'AX_FUNC_POSIX_MEMALIGN' => 1, 'AC_LIBTOOL_LANG_F77_CONFIG' => 1, 'AM_CONDITIONAL' => 1, 'LT_LIB_DLLOAD' => 1, 'DX_FEATURE_dot' => 1, - 'LTVERSION_VERSION' => 1, - '_LT_PROG_CXX' => 1, - '_LT_PROG_F77' => 1, 'LTDL_INIT' => 1, - 'm4_include' => 1, + '_LT_PROG_F77' => 1, + '_LT_PROG_CXX' => 1, + 'LTVERSION_VERSION' => 1, 'AM_PROG_INSTALL_SH' => 1, + 'm4_include' => 1, 'AC_PROG_EGREP' => 1, - 'AC_PATH_MAGIC' => 1, '_AC_AM_CONFIG_HEADER_HOOK' => 1, + 'AC_PATH_MAGIC' => 1, 'AC_LTDL_SYSSEARCHPATH' => 1, 'AM_MAKE_INCLUDE' => 1, 'LT_CMD_MAX_LEN' => 1, @@ -180,51 +180,51 @@ 'AC_PROG_LD_RELOAD_FLAG' => 1, 'DX_FEATURE_chi' => 1, 'AC_LTDL_DLSYM_USCORE' => 1, - 'AM_MISSING_HAS_RUN' => 1, 'LT_LANG' => 1, + 'AM_MISSING_HAS_RUN' => 1, 'LT_SYS_DLSEARCH_PATH' => 1, 'LT_CONFIG_LTDL_DIR' => 1, - 'AC_LIBTOOL_DLOPEN_SELF' => 1, 
'LT_OUTPUT' => 1, + 'AC_LIBTOOL_DLOPEN_SELF' => 1, 'AC_LIBTOOL_PROG_LD_SHLIBS' => 1, - 'AC_WITH_LTDL' => 1, 'AC_LIBTOOL_LINKER_OPTION' => 1, + 'AC_WITH_LTDL' => 1, 'DX_CHI_FEATURE' => 1, - 'LT_AC_PROG_RC' => 1, 'AC_LIBTOOL_CXX' => 1, + 'LT_AC_PROG_RC' => 1, 'LT_INIT' => 1, - 'AX_CHECK_COMPILER_FLAGS' => 1, - 'LT_AC_PROG_GCJ' => 1, 'LT_SYS_DLOPEN_SELF' => 1, + 'LT_AC_PROG_GCJ' => 1, + 'AX_CHECK_COMPILER_FLAGS' => 1, 'DX_CURRENT_FEATURE' => 1, - 'AM_DEP_TRACK' => 1, - 'AM_DISABLE_STATIC' => 1, '_LT_AC_PROG_CXXCPP' => 1, - 'AM_CONFIG_HEADER' => 1, + 'AM_DISABLE_STATIC' => 1, + 'AM_DEP_TRACK' => 1, '_AC_PROG_LIBTOOL' => 1, - 'DX_HTML_FEATURE' => 1, + 'AM_CONFIG_HEADER' => 1, '_AM_IF_OPTION' => 1, + 'DX_HTML_FEATURE' => 1, 'AC_PATH_TOOL_PREFIX' => 1, - 'm4_pattern_allow' => 1, 'AC_LIBTOOL_F77' => 1, + 'm4_pattern_allow' => 1, 'AM_SET_LEADING_DOT' => 1, - '_LT_PROG_FC' => 1, 'LT_AC_PROG_EGREP' => 1, - 'DX_DIRNAME_EXPR' => 1, + '_LT_PROG_FC' => 1, '_AM_DEPENDENCIES' => 1, + 'DX_DIRNAME_EXPR' => 1, 'AC_LIBTOOL_LANG_C_CONFIG' => 1, 'LTOPTIONS_VERSION' => 1, '_LT_AC_SYS_COMPILER' => 1, - 'DX_FEATURE_man' => 1, 'AM_PROG_NM' => 1, + 'DX_FEATURE_man' => 1, 'DX_PS_FEATURE' => 1, 'AC_LIBLTDL_CONVENIENCE' => 1, 'AC_DEPLIBS_CHECK_METHOD' => 1, - 'AC_LIBLTDL_INSTALLABLE' => 1, 'AM_SET_CURRENT_AUTOMAKE_VERSION' => 1, + 'AC_LIBLTDL_INSTALLABLE' => 1, 'AC_LTDL_ENABLE_INSTALL' => 1, - 'LT_PROG_GCJ' => 1, 'AC_LIBTOOL_SYS_DYNAMIC_LINKER' => 1, + 'LT_PROG_GCJ' => 1, 'DX_FEATURE_chm' => 1, 'AM_INIT_AUTOMAKE' => 1, 'DX_FEATURE_rtf' => 1, @@ -235,32 +235,32 @@ '_LT_AC_LOCK' => 1, '_LT_AC_LANG_RC_CONFIG' => 1, 'LT_PROG_GO' => 1, - 'DX_ENV_APPEND' => 1, 'LT_SYS_MODULE_PATH' => 1, - 'LT_WITH_LTDL' => 1, + 'DX_ENV_APPEND' => 1, 'AC_LIBTOOL_POSTDEP_PREDEP' => 1, - 'DX_ARG_ABLE' => 1, - 'AX_GCC_ARCHFLAG' => 1, + 'LT_WITH_LTDL' => 1, 'AC_LTDL_SHLIBPATH' => 1, + 'AX_GCC_ARCHFLAG' => 1, + 'DX_ARG_ABLE' => 1, 'AX_OPENMP' => 1, 'AM_AUX_DIR_EXPAND' => 1, - 'AC_LIBTOOL_PROG_COMPILER_NO_RTTI' => 1, 
'_LT_AC_LANG_F77_CONFIG' => 1, - '_LT_COMPILER_OPTION' => 1, + 'AC_LIBTOOL_PROG_COMPILER_NO_RTTI' => 1, '_AM_SET_OPTIONS' => 1, - 'AM_RUN_LOG' => 1, + '_LT_COMPILER_OPTION' => 1, '_AM_OUTPUT_DEPENDENCY_COMMANDS' => 1, - 'AC_LIBTOOL_PICMODE' => 1, - 'AC_LTDL_SYS_DLOPEN_DEPLIBS' => 1, + 'AM_RUN_LOG' => 1, 'AC_LIBTOOL_SYS_OLD_ARCHIVE' => 1, - 'ACX_PTHREAD' => 1, - 'AC_CHECK_LIBM' => 1, + 'AC_LTDL_SYS_DLOPEN_DEPLIBS' => 1, + 'AC_LIBTOOL_PICMODE' => 1, 'LT_PATH_LD' => 1, + 'AC_CHECK_LIBM' => 1, + 'ACX_PTHREAD' => 1, 'AC_LIBTOOL_SYS_LIB_STRIP' => 1, '_AM_MANGLE_OPTION' => 1, - 'DX_FEATURE_xml' => 1, - 'AC_LIBTOOL_SYS_MAX_CMD_LEN' => 1, 'AC_LTDL_SYMBOL_USCORE' => 1, + 'AC_LIBTOOL_SYS_MAX_CMD_LEN' => 1, + 'DX_FEATURE_xml' => 1, 'AM_SET_DEPDIR' => 1, '_LT_CC_BASENAME' => 1, 'DX_FEATURE_ps' => 1, @@ -280,57 +280,57 @@ 'configure.in' ], { - 'AM_PROG_F77_C_O' => 1, '_LT_AC_TAGCONFIG' => 1, - 'm4_pattern_forbid' => 1, + 'AM_PROG_F77_C_O' => 1, 'AC_INIT' => 1, - 'AC_CANONICAL_TARGET' => 1, + 'm4_pattern_forbid' => 1, '_AM_COND_IF' => 1, - 'AC_CONFIG_LIBOBJ_DIR' => 1, + 'AC_CANONICAL_TARGET' => 1, 'AC_SUBST' => 1, - 'AC_CANONICAL_HOST' => 1, + 'AC_CONFIG_LIBOBJ_DIR' => 1, 'AC_FC_SRCEXT' => 1, + 'AC_CANONICAL_HOST' => 1, 'AC_PROG_LIBTOOL' => 1, 'AM_INIT_AUTOMAKE' => 1, - 'AC_CONFIG_SUBDIRS' => 1, 'AM_PATH_GUILE' => 1, + 'AC_CONFIG_SUBDIRS' => 1, 'AM_AUTOMAKE_VERSION' => 1, 'LT_CONFIG_LTDL_DIR' => 1, - 'AC_CONFIG_LINKS' => 1, 'AC_REQUIRE_AUX_FILE' => 1, - 'LT_SUPPORTED_TAG' => 1, + 'AC_CONFIG_LINKS' => 1, 'm4_sinclude' => 1, + 'LT_SUPPORTED_TAG' => 1, 'AM_MAINTAINER_MODE' => 1, 'AM_NLS' => 1, 'AC_FC_PP_DEFINE' => 1, 'AM_GNU_GETTEXT_INTL_SUBDIR' => 1, - '_m4_warn' => 1, 'AM_MAKEFILE_INCLUDE' => 1, + '_m4_warn' => 1, 'AM_PROG_CXX_C_O' => 1, - '_AM_MAKEFILE_INCLUDE' => 1, '_AM_COND_ENDIF' => 1, + '_AM_MAKEFILE_INCLUDE' => 1, 'AM_ENABLE_MULTILIB' => 1, 'AM_SILENT_RULES' => 1, 'AM_PROG_MOC' => 1, 'AC_CONFIG_FILES' => 1, - 'LT_INIT' => 1, 'include' => 1, - 'AM_GNU_GETTEXT' => 1, + 
'LT_INIT' => 1, 'AM_PROG_AR' => 1, + 'AM_GNU_GETTEXT' => 1, 'AC_LIBSOURCE' => 1, - 'AC_CANONICAL_BUILD' => 1, 'AM_PROG_FC_C_O' => 1, + 'AC_CANONICAL_BUILD' => 1, 'AC_FC_FREEFORM' => 1, - 'AC_FC_PP_SRCEXT' => 1, 'AH_OUTPUT' => 1, - 'AC_CONFIG_AUX_DIR' => 1, + 'AC_FC_PP_SRCEXT' => 1, '_AM_SUBST_NOTMAKE' => 1, - 'm4_pattern_allow' => 1, - 'AM_PROG_CC_C_O' => 1, + 'AC_CONFIG_AUX_DIR' => 1, 'sinclude' => 1, - 'AM_CONDITIONAL' => 1, - 'AC_CANONICAL_SYSTEM' => 1, + 'AM_PROG_CC_C_O' => 1, + 'm4_pattern_allow' => 1, 'AM_XGETTEXT_OPTION' => 1, + 'AC_CANONICAL_SYSTEM' => 1, + 'AM_CONDITIONAL' => 1, 'AC_CONFIG_HEADERS' => 1, 'AC_DEFINE_TRACE_LITERAL' => 1, 'AM_POT_TOOLS' => 1, @@ -351,57 +351,57 @@ 'configure.in' ], { - '_LT_AC_TAGCONFIG' => 1, 'AM_PROG_F77_C_O' => 1, - 'AC_INIT' => 1, + '_LT_AC_TAGCONFIG' => 1, 'm4_pattern_forbid' => 1, - 'AC_CANONICAL_TARGET' => 1, + 'AC_INIT' => 1, '_AM_COND_IF' => 1, - 'AC_CONFIG_LIBOBJ_DIR' => 1, + 'AC_CANONICAL_TARGET' => 1, 'AC_SUBST' => 1, - 'AC_CANONICAL_HOST' => 1, + 'AC_CONFIG_LIBOBJ_DIR' => 1, 'AC_FC_SRCEXT' => 1, + 'AC_CANONICAL_HOST' => 1, 'AC_PROG_LIBTOOL' => 1, 'AM_INIT_AUTOMAKE' => 1, - 'AC_CONFIG_SUBDIRS' => 1, 'AM_PATH_GUILE' => 1, + 'AC_CONFIG_SUBDIRS' => 1, 'AM_AUTOMAKE_VERSION' => 1, 'LT_CONFIG_LTDL_DIR' => 1, - 'AC_REQUIRE_AUX_FILE' => 1, 'AC_CONFIG_LINKS' => 1, - 'LT_SUPPORTED_TAG' => 1, + 'AC_REQUIRE_AUX_FILE' => 1, 'm4_sinclude' => 1, + 'LT_SUPPORTED_TAG' => 1, 'AM_MAINTAINER_MODE' => 1, 'AM_NLS' => 1, 'AC_FC_PP_DEFINE' => 1, 'AM_GNU_GETTEXT_INTL_SUBDIR' => 1, - '_m4_warn' => 1, 'AM_MAKEFILE_INCLUDE' => 1, + '_m4_warn' => 1, 'AM_PROG_CXX_C_O' => 1, - '_AM_COND_ENDIF' => 1, '_AM_MAKEFILE_INCLUDE' => 1, + '_AM_COND_ENDIF' => 1, 'AM_ENABLE_MULTILIB' => 1, 'AM_PROG_MOC' => 1, 'AM_SILENT_RULES' => 1, 'AC_CONFIG_FILES' => 1, - 'include' => 1, 'LT_INIT' => 1, - 'AM_PROG_AR' => 1, + 'include' => 1, 'AM_GNU_GETTEXT' => 1, + 'AM_PROG_AR' => 1, 'AC_LIBSOURCE' => 1, - 'AC_CANONICAL_BUILD' => 1, 'AM_PROG_FC_C_O' => 1, + 
'AC_CANONICAL_BUILD' => 1, 'AC_FC_FREEFORM' => 1, - 'AC_FC_PP_SRCEXT' => 1, 'AH_OUTPUT' => 1, - '_AM_SUBST_NOTMAKE' => 1, + 'AC_FC_PP_SRCEXT' => 1, 'AC_CONFIG_AUX_DIR' => 1, - 'AM_PROG_CC_C_O' => 1, - 'sinclude' => 1, + '_AM_SUBST_NOTMAKE' => 1, 'm4_pattern_allow' => 1, - 'AM_CONDITIONAL' => 1, - 'AC_CANONICAL_SYSTEM' => 1, + 'sinclude' => 1, + 'AM_PROG_CC_C_O' => 1, 'AM_XGETTEXT_OPTION' => 1, + 'AC_CANONICAL_SYSTEM' => 1, + 'AM_CONDITIONAL' => 1, 'AC_CONFIG_HEADERS' => 1, 'AC_DEFINE_TRACE_LITERAL' => 1, 'AM_POT_TOOLS' => 1, diff --git a/examples/test_qr.c b/examples/test_qr.c index 6f77203afbba2c5b3e1c0a953eb9b89323609187..ada33a33acf4833ed478e10bcd0f0cadf466a5e5 100644 --- a/examples/test_qr.c +++ b/examples/test_qr.c @@ -28,283 +28,472 @@ #include <unistd.h> #include <math.h> #include <omp.h> +#include <pthread.h> + + + -/* LAPACKE header. */ -#include <lapacke.h> #include <cblas.h> /* Local includes. */ #include "quicksched.h" -/* - * Sam's routines for the tiled QR decomposition. - */ - -/* - \brief Computes 2-norm of a vector \f$x\f$ - - Computes the 2-norm by computing the following: \f[\textrm{2-norm}=\sqrt_0^lx(i)^2\f] +/** + * Takes a column major matrix, NOT tile major. size is length of a side of the matrix. Only works for square matrices. + * This function is simply for validation and is implemented naively as we know of no implementation to retrieve Q from the tiled QR. 
*/ -float do2norm(float* x, int l) +double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) { - float sum = 0, norm; - int i; + double* Q = malloc(sizeof(double)*size*size); + double* Qtemp = malloc(sizeof(double)*size*size); + double* w = malloc(sizeof(double)*size); + double* ww = malloc(sizeof(double)*size*size); + double* temp = malloc(sizeof(double)*size*size); + int i, k, l, j, n; + bzero(Q, sizeof(double)*size*size); + bzero(Qtemp, sizeof(double)*size*size); + bzero(ww, sizeof(double)*size*size); + for(i = 0; i < size; i++) + { + Q[i*size + i] = 1.0; + } + int numcoltile = size / tilesize; + int numrowtile = size / tilesize; + for(k = 0; k < numrowtile; k++) + { + for(l = 0; l < tilesize; l++) + { + bzero(Qtemp, sizeof(double)*size*size); + for(i = 0; i < size; i++) + { + Qtemp[i*size + i] = 1.0; + } + - for(i = 0; i < l; i++) - sum += x[i] * x[i]; + for(i = k; i < numcoltile; i++) + { + bzero(w, sizeof(double)*size); + + for(j = 0 ; j < tilesize; j++) + { + w[i*tilesize + j] = HR[(k*tilesize+l)*size + i*tilesize+j]; + } + w[k*tilesize+l] = 1.0; + if(k*tilesize+l > i*tilesize) + { + for(j = 0; j < k*tilesize+l; j++) + w[j] = 0.0; + } - norm = sqrt(sum); - return norm; + /* Compute (I - tau*w*w')' */ + for(j = 0; j < size; j++) + { + for(n = 0; n < size; n++) + { + if(j != n) + ww[n*size + j] = -tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n]; + else + ww[n*size + j] = 1.0 - tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n]; + } + } + + /* Qtemp = Qtemp * (I-tau*w*w')' */ + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Qtemp, size, ww, size, 0.0, temp, size); + double *b = Qtemp; + Qtemp = temp; + temp = b; + } + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Q, size, Qtemp, size, 0.0, temp, size); + double *b = Q; + Q = temp; + temp = b; + } + } + + free(Qtemp); + free(w); + free(ww); + free(temp); + return Q; } -/** - * \brief Computes a Householder reflector from a pair of vectors 
from coupled blocks - * - * Calculates the Householder vector of the vector formed by a column in a pair of coupled blocks. - * There is a single non-zero element, in the first row, of the top vector. This is passed as topDiag - * - * \param topDiag The only non-zero element of the incoming vector in the top block - * \param ma The number of elements in the top vector - * \param xb Pointer to the lower vector - * \param l The number of elements in the whole vector - * \param vk A pointer to a pre-allocated array to store the householder vector of size l - * - * \returns void - */ -void calcvkfloat (float topDiag, - int ma, - float* xb, - int l, - float* vk) +double* getR(double*HR, int size) { - int sign, i; - float norm, div; - //same non-standard normalisation as for single blocks above, but organised without a temporary beta veriable - - sign = topDiag >= 0.0 ? 1 : -1; - vk[0] = topDiag; - //use vk[0] as beta - for(i = 1; i < ma; i++) - vk[i] = 0; - - for(; i < l; i++) - vk[i] = xb[i - ma]; - - norm = do2norm(vk, l); - vk[0] += norm * sign; - - if(norm != 0.0) - { - div = 1/vk[0]; - - for(i = 1; i < l; i++) - vk[i] *= div; - } + double* R = malloc(sizeof(double) * size * size); + int i, j; + bzero(R, sizeof(double) * size * size); + for(i = 0; i< size; i++) + { + for(j = 0; j <= i; j++) + { + R[i*size + j] = HR[i*size + j]; + } + } + return R; } - -void updatefloatQ_WY (float* blockA, - float* blockB, - float* blockTau, - int k, int ma, int mb, int n, - int ldm, - float* hhVector)//bottom, essential part. 
+void printMatrix(double* Matrix, int m, int n, int tilesize) { - int i, j; - - float tau = 1.0, beta; - - /* Compute tau = 2/v'v */ - for(i = 0; i < mb; i ++) - tau += hhVector[i] * hhVector[i]; - - tau = 2/tau; + int i, j; + + for(i=0; i < m*tilesize; i++) + { + for(j = 0; j < n*tilesize; j++) + { + printf(" %.3f ", Matrix[j*m*tilesize +i]); - for(j = k; j < n; j ++) - { - /* Compute v'*b_j */ - beta = blockA[(j*ldm) + k]; + } + printf("\n"); + } - /* Then for lower half */ - for(i = 0; i < mb; i ++) - beta += blockB[(j*ldm) + i] * hhVector[i]; +} - beta *= tau; +double* columnToTile( double* columnMatrix, int size , int m , int n , int tilesize) +{ + double* TileMatrix; + TileMatrix = malloc(sizeof(double) * size ); + if(TileMatrix == NULL) + error("failed to allocate TileMatrix"); + int i,j,k,l; - /* Compute b_j = b_j - beta*v_k */ - blockA[(j*ldm) + k] -= beta; - - for(i = 0; i < mb; i ++) - blockB[(j*ldm) + i] -= beta * hhVector[i]; - } + for( i = 0; i < n ; i++ ) + { + for( j = 0; j < m; j++ ) + { + double *tileStart = &columnMatrix[i*m*tilesize*tilesize + j*tilesize]; + double *tilePos = &TileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize]; + for(k = 0; k < tilesize; k++) + { + tileStart = &columnMatrix[i*m*tilesize*tilesize+k*m*tilesize + j*tilesize]; + for( l = 0; l < tilesize; l++ ) + { + tilePos[k*tilesize + l] = tileStart[l]; + } + } + } + } - /* Insert vector below diagonal. 
*/ - for(i = 0; i < mb; i ++) - blockB[(k*ldm) + i] = hhVector[i]; + return TileMatrix; - blockTau[k] = tau; } -void DTSQRF (float* blockA, - float* blockB, - float* blockTau, - int ma, - int mb, - int n, - int ldm, - float* hhVector) +double* tileToColumn( double* tileMatrix, int size, int m , int n , int tilesize) { - int k; - float* xVectA, *xVectB; - - xVectA = blockA; - xVectB = blockB; - - for(k = 0; k < n; k++) - { - //vk = sign(x[1])||x||_2e1 + x - //vk = vk/vk[0] - calcvkfloat(xVectA[0], ma - k, xVectB, (ma + mb) - k, hhVector);//returns essential - - //matA(k:ma,k:na) = matA(k:ma,k:na) - (2/(vk.T*vk))*vk*(vk.T*matA(k:ma,k:na) - //update both blocks, preserving the vectors already stored below the diagonal in the top block and treating them as if they were zeros. - updatefloatQ_WY (blockA, blockB, - blockTau, - k, ma, mb, n, - ldm, - hhVector + ma - k); - - xVectA += ldm + 1; - xVectB += ldm; - } + double* ColumnMatrix; + ColumnMatrix = (double*) malloc(sizeof(double) * size ); + if(ColumnMatrix == NULL) + error("failed to allocate ColumnMatrix"); + int i,j,k,l; + for( i = 0; i < n ; i++ ) + { + for(j = 0; j < m ; j++ ) + { + /* Tile on ith column is at i*m*32*32.*/ + /* Tile on jth is at j*32*32 */ + double *tile = &tileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize]; + /* Column starts at same position as tile. */ + /* Row j*32.*/ + double *tilePos = &ColumnMatrix[i*m*tilesize*tilesize + j*tilesize]; + for( k = 0; k < tilesize; k++ ) + { + for(l=0; l < tilesize; l++) + { + tilePos[l] = tile[l]; + } + /* Next 32 elements are the position of the tile in the next column.*/ + tile = &tile[tilesize]; + /* Move to the j*32th position in the next column. */ + tilePos = &tilePos[tilesize*m]; + + } + } + } + return ColumnMatrix; } -void DSSRFT (float* blockV, - float* blockA, float* blockB, - float* blockTau, - int b, int n, int ldm) +/* Routines for the tiled QR decomposition.*/ + +/** + * + * @brief Computes the QR decomposition of a tile. 
+ * + * @param cornerTile A pointer to the tile for which the decomposition is computed. + * @param tilesize The number of elements on a row/column of the tile. + * @param tauMatrix A pointer to the tau Matrix. + * @param k The value of k for the QR decomposition + * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). + * + * + */ +void DGEQRF(double* restrict cornerTile, int tileSize, double* restrict tauMatrix, int k, int tauNum, double* w) { - int i, j, k; + int i, j, n; + double norm=0.0, sign, u1, tau, z; - float tau, beta; - /* Compute b_j = b_j - tau*v*v'*b_j for each column j of blocks A & B, - and for each householder vector v of blockV */ + /*Find the householder vector for each row. */ + for(i = 0; i < tileSize; i++) + { + norm = 0.0; + /*Fill w with the vector.*/ + for(j=i; j < tileSize; j++) + { + /* ith row is i*tileSize, only want elements on diagonal or below.*/ + w[ j ] = cornerTile[ i*tileSize +j ]; + /*Find the norm as well */ + norm = norm + w[j]*w[j]; + } + if(w[i] >= 0.0) + sign = -1; + else + sign = 1; - /* For each column of B */ - for(j = 0; j < n; j ++) - { - /* For each householder vector. */ - for(k = 0; k < n; k ++) - { - /* tau = 2/v'v, computed earlier, stored in T(k,k). */ - tau = blockTau[k]; + norm = sqrt(norm); - /* Compute beta = v_k'b_j. */ - /* v_k is >0 (=1) only at position k in top half. */ - beta = blockA[(j*ldm) + k]; + u1 = w[i] - sign*norm; - /* For lower portion of v_k, aligning with the lower block */ - for(i = 0; i < b; i ++) - beta += blockB[(j*ldm) + i] * blockV[(k*ldm) + i]; + if(u1 != 0.0) + { + for(j=i+1; j < tileSize; j++) + w[j] = w[j] / u1; + } + else + { + for(j=i+1; j < tileSize; j++) + w[j] = 0.0; + } - beta *= tau; - - /* Compute b_j = b_j - beta * v */ - /* v_k = 1 at (k) in top half again */ - blockA[(j*ldm) + k] -= beta; - - /* Apply to bottom block. 
*/ - for(i = 0; i < b; i ++) - blockB[(j*ldm) + i] -= beta * blockV[(k*ldm) + i]; - } - } -} + if(norm != 0.0) + tau = -sign*u1/norm; + else + tau = 0.0; -float* randomMatrix(int m, int n) -{ - float* Matrix; - Matrix = (float*) malloc( sizeof(float) * m*n*32*32); - if(Matrix == NULL) - error("Failed to allocate Matrix"); - int r,c; - m = m*32; - n = n*32; -for(c = 0; c < n; c++) - { - for(r = 0; r < m; r++) - { - //CO(i,j,m) ((m * j) + i) - Matrix[(m*c)+r] = ((float)(rand() % 201) - 100.0) / 100.0; - } - } -return Matrix; + /*Store the below diagonal vector */ + + for(j = i+1; j < tileSize; j++) + { + cornerTile[ i*tileSize +j ] = w[j]; + } + cornerTile[ i*tileSize + i ] = sign*norm; + w[i] = 1.0; + /* Apply the householder transformation to the rest of the tile, for everything to the right of the diagonal. */ + for(j = i+1; j < tileSize; j++) + { + /*Compute w'*A_j*/ + z = cornerTile[ j*tileSize+i]; + for(n = i+1; n < tileSize; n++) + { + z = z + cornerTile[j*tileSize+n] * w[n]; + } + /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */ + for(n=i; n < tileSize; n++) + { + cornerTile[j*tileSize+n] = cornerTile[j*tileSize+n] - tau*w[n]*z; + } + + } + /* Store tau. We're on k*tileSize+ith row. kth column.*/ + tauMatrix[(k*tileSize+i)*tauNum+k] = tau; + + } } - -float* generateMatrix( int m, int n) + + +/** + * + * @brief Applies the householder factorisation of the corner to the row tile. + * + * @param cornerTile A pointer to the tile for which the householder is stored. + * @param rowTiles The tile to which the householder is applied. + * @param tilesize The number of elements on a row/column of the tile. + * @param tauMatrix A pointer to the tau Matrix. + * @param jj The value of j for the QR decomposition. + * @param kk The value of k for the QR decomposition. + * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). 
+ * + * + */ +void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize, int jj, int kk, double* restrict tauMatrix, int tauNum, double* w) { - float* Matrix; - Matrix = (float*) malloc( sizeof(float) * m*n*32*32); - if(Matrix == NULL) - error("Failed to allocate Matrix"); - int i, j; - memset ( Matrix, 0, sizeof(float)*m*n*32*32 ); + int i, j, n; + double z=0.0; + - for(i = 0; i < n*32; i++) + /* For each row in the corner Tile*/ + for(i = 0; i < tileSize; i++) { - for(j = 0; j < m*32; j++) + /*Get w for row i */ + for(j = i; j < tileSize; j++) { - Matrix[i*m*32 + j] = (float)(i+j); + w[j] = cornerTile[i*tileSize + j]; } + w[i] = 1.0; + + /* Apply to the row Tile */ + for(j = 0; j < tileSize; j++) + { + z=0.0; + /* Below Diagonal!*/ + /*Compute w'*A_j*/ + for(n = i; n < tileSize; n++) + { + z = z + w[n] * rowTile[j*tileSize+n]; + } + for(n = i; n < tileSize; n++) + { + rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z; + } + } + + } - return Matrix; + + } -float* createIdentity(int m, int n) +/** + * + * @brief Applies the householder factorisation of the corner to the row tile. + * + * @param cornerTile The corner tile for this value of k. + * @param columnTile The tile for which householders are computed. + * @param tilesize The number of elements on a row/column of the tile. + * @param tauMatrix A pointer to the tau Matrix. + * @param ii The value of i for the QR decomposition. + * @param kk The value of k for the QR decomposition. + * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). 
+ * + * + */ +void DTSQRF( double* restrict cornerTile, double* restrict columnTile, int tilesize, int ii, int kk, double* restrict tauMatrix, int tauNum, double* w ) { - float* Matrix; - Matrix = (float*) malloc( sizeof(float) * m*n*32*32); - if(Matrix == NULL) - error("Failed to allocate Matrix"); - int i, j; - memset ( Matrix, 0, sizeof(float)*m*n*32*32 ); + int i, j, n; + double norm=0.0, sign, u1, tau, z; - for(i = 0; i < n*32; i++) + /* For each column compute the householder vector. */ + for(i = 0; i < tilesize; i++) { - for(j = 0; j < m*32; j++) + norm = 0.0; + w[i] = cornerTile[ i*tilesize+i ]; + norm = norm + w[i]*w[i]; + for(j = i+1; j < tilesize; j++) + { + w[j] = 0.0; + } + for(j = 0; j < tilesize; j++) + { + w[tilesize+j] = columnTile[ i*tilesize+j ]; + norm = norm + w[tilesize+j]*w[tilesize+j]; + } + + norm = sqrt(norm); + if(w[i] >= 0.0) + sign = -1; + else + sign = 1; + + + u1 = w[i] - sign*norm; + if(u1 != 0.0) + { + for(j = i+1; j < 2*tilesize; j++){ + w[j] = w[j]/u1; + } + }else { - if(i==j) + for(j = i+1; j < 2*tilesize; j++) + w[j] = 0.0; + } + + if(norm != 0) + tau = -sign*u1/norm; + else + tau = 0.0; + + /* Apply to each row to the right.*/ + for(j = i; j < tilesize; j++) + { + /* Find w'*A_j, w is 0s except for first value with upper tile.*/ + z = 1.0 * cornerTile[ j*tilesize+i ]; + for(n = 0; n < tilesize; n++) { - Matrix[i*m*32 + j] = 1.0; - }else + z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ]; + } + /* Apply to upper tile.*/ + cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z; + for(n = i+1; n < tilesize; n++) + { + cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z; + } + /* Apply to lower tile.*/ + for(n = 0; n < tilesize; n++) { - Matrix[i*m*32 + j] = 0.0; + columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z; } + + } + /* Store w*/ + for(j = 0; j < tilesize; j++){ + columnTile[ i*tilesize+j ] = w[tilesize+j]; } + tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau; } - return 
Matrix; } -void printMatrix(float* Matrix, int m, int n) +/** + * + * @brief Applies the householder factorisation of the corner to the row tile. + * + * @param cornerTile A pointer to the tile to have the householder applied. + * @param columnTile The tile in which the householders are stored. + * @param rowTile The upper tile to have the householders applied. + * @param tilesize The number of elements on a row/column of the tile. + * @param tauMatrix A pointer to the tau Matrix. + * @param ii The value of i for the QR decomposition. + * @param jj The value of j for the QR decomposition. + * @param kk The value of k for the QR decomposition. + * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). + * + * + */ +void DSSRFT( double* restrict cornerTile, double* restrict columnTile, double* restrict rowTile, int tilesize, int ii, int jj, int kk, double* restrict tauMatrix, int tauNum , double* w) { - int i, j; + int i, j, n; + double z; + - for(i=0; i < m*32; i++) + for(i = 0; i < tilesize; i++) { - printf("{ "); - for(j = 0; j < n*32; j++) + for(j = 0; j < i; j++) + w[j] = 0.0; + w[i] = 1.0; + for(j = i+1; j < tilesize; j++) + w[j] = 0.0; + for(j = 0; j < tilesize; j++) + w[j+tilesize] = columnTile[i*tilesize +j]; + + /* Apply householder vector (w) to the tiles.*/ + for(j = 0; j < tilesize; j++) { - printf(" %.3f ", Matrix[j*m*32 +i]); - + z = 0.0; + /* Compute w' * A_j */ + for(n = 0; n < tilesize; n++) + { + z += w[n] * rowTile[j*tilesize+n]; + z += w[n + tilesize] * cornerTile[j*tilesize+n]; + } + for(n = 0; n < tilesize; n++) + { + rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z; + cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z; + } } - printf(" }"); - printf("\n"); } -} +} /** * @brief Computed a tiled QR factorization using QuickSched. 
@@ -314,10 +503,10 @@ void printMatrix(float* Matrix, int m, int n) * @param nr_threads Number of threads to use. */ -void test_qr ( int m , int n , int K , int nr_threads , int runs ) { +void test_qr ( int m , int n , int K , int nr_threads , int runs, double *matrix ) { int k, j, i; - float *A, *A_orig, *tau; + double *A, *A_orig, *tau; struct qsched s; qsched_task_t *tid, tid_new; qsched_res_t *rid; @@ -332,28 +521,22 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) { /* Decode the task data. */ int *idata = (int *)data; - int i = idata[0], j = idata[1];//, k = idata[2]; - float buff[ 2*K*K ]; + int i = idata[0], j = idata[1], k = idata[2]; + double ww[ 2*K ]; /* Decode and execute the task. */ switch ( type ) { case task_DGEQRF: - LAPACKE_sgeqrf_work( LAPACK_COL_MAJOR , K, K , - &A[ j*m*K*K + i*K ] , m*K , &tau[ j*m*K + i*K ] , - buff , 2*K*K ); - + DGEQRF( &A[ (k*m+k)*K*K], K, tau, k, m, ww); break; case task_DLARFT: - LAPACKE_slarft_work( LAPACK_COL_MAJOR , 'F' , 'C' , - K , K , &A[ i*m*K*K + i*K ] , - m*K , &tau[ i*m*K + i*K ] , &A[ j*m*K*K + i*K ] , - m*K ); + DLARFT( &A[ (k*m+k)*K*K ],&A[(j*m+k)*K*K], K, j, k, tau, m, ww); break; case task_DTSQRF: - //DTSQRF( &A[ j*m*K*K + j*K ] , &A[ j*m*K*K + i*K ] , &tau[ j*m*K + i*K ] , K , K , K , K*m , buff ); + DTSQRF( &A[(k*m+k)*K*K], &A[(k*m+i)*K*K], K, i, k, tau, m , ww); break; case task_DSSRFT: - //DSSRFT( &A[ k*m*K + i*K ] , &A[ j*m*K*K + k*K ] , &A[ j*m*K*K + i*K ] , &tau[ k*m*K + i*K ] , K , K , K*m ); + DSSRFT(&A[(j*m+i)*K*K], &A[(k*m+i)*K*K], &A[(j*m+k)*K*K], K, i, j, k, tau, m , ww); break; default: error( "Unknown task type." ); @@ -363,28 +546,20 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) { /* Allocate and fill the original matrix. 
*/ - if ( ( A = (float *)malloc( sizeof(float) * m * n * K * K ) ) == NULL || - ( tau = (float *)malloc( sizeof(float) * m * n * K ) ) == NULL || - ( A_orig = (float *)malloc( sizeof(float) * m * n * K * K ) ) == NULL ) + if ( ( A = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL || + ( tau = (double *)malloc( sizeof(double) * m * n * K ) ) == NULL || + ( A_orig = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL ) error( "Failed to allocate matrices." ); - free(A_orig); -// for ( k = 0 ; k < m * n * K * K ; k++ ) - // A_orig[k] = 2*((float)rand()) / RAND_MAX - 1.0; -// A_orig = generateMatrix(m, n); - srand(5); - A_orig = randomMatrix(m,n); - printMatrix(A_orig, m, n); - printf("\n\n\n"); - memcpy( A , A_orig , sizeof(float) * m * n * K * K ); - bzero( tau , sizeof(float) * m * n * K ); - - LAPACKE_sgeqrf( LAPACK_COL_MAJOR , K*m, K*n , - A, m*K , tau); + for ( k = 0 ; k < m * n * K * K ; k++ ) + { + if(matrix == NULL) + A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0; + else + A_orig[k] = matrix[k]; + } + memcpy( A , A_orig , sizeof(double) * m * n * K * K ); + bzero( tau , sizeof(double) * m * n * K ); - printMatrix(A, m , n ); - printf("\n\n\n"); - memcpy( A , A_orig , sizeof(float) * m * n * K * K ); - bzero( tau , sizeof(float) * m * n * K ); /* Dump A_orig. */ /* message( "A_orig = [" ); for ( k = 0 ; k < m*K ; k++ ) { @@ -404,7 +579,7 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) { error( "Failed to allocate tid/rid matrix." ); for ( k = 0 ; k < m * n ; k++ ) { tid[k] = qsched_task_none; - rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none , NULL, 0 ); + rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none ); } /* Build the tasks. 
*/ @@ -451,9 +626,10 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) { qsched_adduse( &s , tid_new , rid[ k*m + i ] ); qsched_adduse( &s , tid_new , rid[ j*m + k ] ); qsched_addunlock( &s , tid[ k*m + i ] , tid_new ); - qsched_addunlock( &s , tid[ j*m + k ] , tid_new ); + qsched_addunlock( &s, tid[j*m+i-1], tid_new); if ( tid[ j*m + i ] != -1 ) qsched_addunlock( &s , tid[ j*m + i ] , tid_new ); + tid[ j*m + i ] = tid_new; } @@ -481,13 +657,7 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) { tot_run += toc_run - tic; } - printf("tau = "); - for(k = 0; k < m * n * K ; k++) - { - printf("%.3f ", tau[k]); - } - printf("\n"); - printMatrix(A, m, n); + /* Dump A. */ /* message( "A = [" ); @@ -521,12 +691,60 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) { for ( k = 0 ; k < qsched_timer_count ; k++ ) message( "timer %s is %lli ticks." , qsched_timer_names[k] , s.timers[k]/runs ); + if(matrix != NULL) + { + for(k = 0; k < m*n*K*K; k++) + matrix[k] = A[k]; + } + + + /* Test if the decomposition was correct.*/ + /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K); + double *Q = computeQ(tempMatrix, m*K, K, tau, m); + double *R = getR(tempMatrix, m*K); + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, m*K, R, m*K, 0.0, tempMatrix, m*K); + free(Q); + Q = tileToColumn(A_orig, m*n*K*K, m, n, K); + for(i = 0; i < m * n * K * K; i++) + { + if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] < 0.995)) + printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i], tempMatrix[i]); + } + free(tempMatrix); + free(Q); + free(R);*/ + /* Clean up. */ + free(A); + free(A_orig); + free(tau); + free(tid); + free(rid); qsched_free( &s ); } +/* Generates a random matrix. 
*/ +double* generateColumnMatrix(int size) +{ + double* matrix = malloc(sizeof(double)*size*size); + if(matrix == NULL) + error("Failed to allocate matrix"); + + unsigned long int m_z = 35532; + int i; + for(i = 0 ; i < size*size; i++) + { + m_z = (1664525*m_z + 1013904223) % 4294967296; + matrix[i] = m_z % 100; + if(matrix[i] < 0) + matrix[i] += 100; + } + return matrix; +} + + /** * @brief Main function. */ @@ -578,7 +796,7 @@ int main ( int argc , char *argv[] ) { message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." , 32*M , 32*N , nr_threads , runs ); - test_qr( M , N , K , nr_threads , runs ); + test_qr( M , N , K , nr_threads , runs , NULL); } diff --git a/src/Makefile.am b/src/Makefile.am index a6771251acbfdfec5fe69035c2f15742d72a24d9..651c009cffdeb821933117f0d24230cc35aec2d7 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -21,7 +21,7 @@ AUTOMAKE_OPTIONS=gnu # Add the debug flag to the whole thing AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \ -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) -DTIMERS \ - # -fsanitize=address -fno-omit-frame-pointer + #-fsanitize=address -fno-omit-frame-pointer # Assign a "safe" version number AM_LDFLAGS = -version-info 0:0:0 @@ -31,12 +31,12 @@ lib_LTLIBRARIES = libquicksched.la libquicksched_la_SOURCES = qsched.c queue.c # List required headers -include_HEADERS = atomic.h lock.h queue.h qsched.h task.h res.h error.h +include_HEADERS = atomic.h lock.h queue.h qsched.h task.h res.h error.h qsched.h # CUDA sources if HAVE_CUDA SOURCES_CUDA = qsched.c queue.c cuda_queue.cu - CUDA_MYFLAGS = -O3 -g -DCPU_TPS=3.1e9 -lineinfo -src-in-ptx --maxrregcount=32 -Xptxas="-v" -Xptxas -dlcm=cg -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true -DFPTYPE_SINGLE -DWITH_CUDA + CUDA_MYFLAGS = -O3 -g -DCPU_TPS=3.1e9 -Xnvlink -rdc=true -lineinfo -src-in-ptx --maxrregcount=32 -Xptxas="-v" -Xptxas -dlcm=cg -gencode arch=compute_30,code=sm_30 -ftz=true -fmad=true 
-DFPTYPE_SINGLE -DWITH_CUDA #-fsanitize=address -fno-omit-frame-pointer .cu: qsched.c queue.c .cu.o: $(NVCC) -c $(NVCCFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) $< -o $@ @@ -50,5 +50,5 @@ endif if HAVE_CUDA lib_LTLIBRARIES += libquicksched_cuda.la libquicksched_cuda_la_SOURCES = $(SOURCES_CUDA) -libquicksched_cuda_la_CFLAGS = -DFPTYPE_SINGLE $(AM_CFLAGS) $(CUDA_CFLAGS) +libquicksched_cuda_la_CFLAGS = -DFPTYPE_SINGLE $(AM_CFLAGS) -DWITH_CUDA $(CUDA_CFLAGS) endif diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu index 0b50bb1b715420f6384a6752e24a9b5620b9db78..6e35f8390e5f8f9cfa1f858e980587393b7555a8 100644 --- a/src/cuda_queue.cu +++ b/src/cuda_queue.cu @@ -24,13 +24,16 @@ #include <stdlib.h> #include <string.h> -#include "cuda_queue.h" +extern "C"{ #include "quicksched.h" +} +#include "cuda_queue.h" #include "res.h" /*Define task types required for GPU */ #define type_load -100 #define type_unload -101 +#define type_ghost -102 /*Define if conflicts are enabled or not.*/ @@ -40,19 +43,64 @@ __device__ struct queue_cuda cuda_queues[ cuda_numqueues ]; __constant__ int cuda_nrqueues; __constant__ int cuda_queue_size; +__constant__ int cuda_numtasks; __device__ struct task *tasks_cuda; __device__ qsched_res_t *locks_cuda; __device__ qsched_res_t *uses_cuda; __device__ qsched_task_t *deps_cuda; __device__ struct res *res_cuda; +__device__ char *data_cuda; +__device__ int cuda_barrier = 0; +__device__ qsched_funtype fun; /** * @brief Get a task ID from the given queue. * */ + + + +#ifdef GPU_locks +__device__ __inline__ int cuda_trylock ( volatile int *l ) { + int res = atomicCAS( (int *)l, 0 , 1 ); + return res; +} + +__device__ __inline__ int cuda_lock ( volatile int *l ) { + while(atomicCAS( (int *)l, 0 , 1 ) != 0 ); + return 0; +} + +__device__ __inline__ int cuda_unlock ( volatile int *l ) { + int res = atomicCAS( (int *)l , 1 , 0) != 1 ; + return res; +} +#endif + +/** + * @brief Copy bulk memory in a strided way. + * + * @param dest Pointer to destination memory. 
+ * @param source Pointer to source memory. + * @param count Number of bytes to copy, must be a multiple of sizeof(int). + */ + +__device__ __inline__ void cuda_memcpy_tasks ( void *dest , void *source , int count , int tid ) { + + int k; + int *idest = (int *)dest, *isource = (int *)source; + + + /* Copy the data in chunks of sizeof(int). */ + for ( k = threadIdx.x ; k < count/sizeof(int) ; k += blockDim.x ){ + idest[k] = isource[k]; +// idest[k] *= tid; + } + + } -__device__ int cuda_queue_gettask ( struct queue_cuda *q ) { +__device__ int cuda_queue_gettask ( struct queue_cuda *q, int wait ) { int ind, tid = -1; @@ -66,7 +114,7 @@ __device__ int cuda_queue_gettask ( struct queue_cuda *q ) { /* Wrap the index. */ ind %= cuda_queue_size; /* Loop until there is a valid task at that index. */ - while ( q->rec_count < q->count && ( tid = q->data[ind] ) < 0 ); + while ( q->rec_count < q->count && ( tid = q->data[ind] ) < 0 && wait); /* Scratch the task from the queue */ if ( tid >= 0 ) @@ -105,20 +153,190 @@ __device__ void cuda_queue_puttask ( struct queue_cuda *q , int tid ) { #ifdef GPU_locks -//TODO +/** + * @brief Unlock a resource and un-hold its parents. + * + * @param s Pointer to the #qsched. + * @param rid The ID of the resource to lock. + */ + +__device__ void cuda_unlockres ( int rid ) { + + int finger; + + /* Unlock the resource. */ + cuda_unlock( &res_cuda[rid].lock ); + + /* Go back up the tree and undo the holds. */ + for ( finger = res_cuda[rid].parent ; finger != qsched_res_none ; finger = res_cuda[finger].parent ) + atomicAdd( (int *) &res_cuda[finger].hold, -1 ); + + } + + +/** + * @brief Lock a resource and hold its parents. + * + * @param rid The ID of the resource to lock. + * + * @return @c 1 if the resource could be locked, @c 0 otherwise. + */ + +__device__ int cuda_lockres ( int rid ) { + + int finger, finger2; + + /* Try to lock the root-level resource. 
*/ + if ( res_cuda[rid].hold || cuda_trylock( &res_cuda[rid].lock ) ) + return 0; + + /* Did the resource get held in the meantime? */ + if ( res_cuda[rid].hold ) { + cuda_unlock( &res_cuda[rid].lock ); + return 0; + } + + /* Follow parents and increase their hold counter, but fail + if any are locked. */ + for ( finger = res_cuda[rid].parent ; finger != qsched_res_none ; finger = res_cuda[finger].parent ) { + if ( cuda_trylock( &res_cuda[finger].lock ) ) + break; + atomicAdd((int *) &res_cuda[finger].hold , 1); + cuda_unlock( &res_cuda[finger].lock ); + } + + /* Did we fail on the way up? */ + if ( finger != qsched_res_none ) { + + /* Unlock the resource. */ + cuda_unlock( &res_cuda[rid].lock ); + + /* Go back up the tree and undo the holds. */ + for ( finger2 = res_cuda[rid].parent ; finger2 != finger ; finger2 = res_cuda[finger2].parent ) + atomicAdd((int *) &res_cuda[finger2].hold, -1 ); + + /* Fail. */ + return 0; + + } + + /* Otherwise, all went well. */ + + else + return 1; + } + + +/** + * @brief Try to get all the locks for a task. + * + * @param tid The ID of the #task to lock. + * + * @return @c 1 if the resources could be locked, @c 0 otherwise. + */ +__device__ int cuda_locktask ( int tid ) { + + int k; + struct task *t; + /* Get a pointer on the task. */ + t = &tasks_cuda[tid]; + + /* Try to lock all the task's locks. */ + for ( k = 0 ; k < t->nr_locks ; k++ ) + if ( cuda_lockres( t->locks[k] ) == 0 ) + break; + + /* If I didn't get all the locks... */ + if ( k < t->nr_locks ) { + /* Unroll the locks I got. */ + for ( k -= 1 ; k >= 0 ; k-- ) + cuda_unlockres( t->locks[k] ); + /* Fail. */ + return 0; + } + + /* Otherwise, all went well. */ + else { + return 1; + } +} +//TODO +/** + * @brief Tell the #qsched that a task has completed. + * + * @param s Pointer to the #qsched. + * @param t Pointer to the completed #task. + */ + +__device__ void cuda_done ( struct task *t ) { + int k; + struct task *t2; + + + /* Release this task's locks. 
*/ + for ( k = 0 ; k < t->nr_locks ; k++ ) + cuda_unlockres( t->locks[k] ); + + /* Loop over the task's unlocks... */ + //for ( k = 0 ; k < t->nr_unlocks ; k++ ) { + + /* Get a grip on the unlocked task. */ + // t2 = &tasks_cuda[ t->unlocks[k] ]; + + /* Is the unlocked task ready to run? */ + //if ( atomicAdd( &t2->wait, -1 ) == 1 && !( t2->flags & task_flag_skip ) ) + // cuda_queue_puttask( &cuda_queues[0] , t->unlocks[k] ); + + //} + + /* Set the task stats. */ + //t->toc = getticks(); + //t->cost = t->toc - t->tic; + } +/** + * @brief Get a task from the given task queue. + * + * Picks tasks from the queue sequentially and checks if they + * can be computed. If not, they are returned to the queue. + * + * This version of the routine does not check for conflicts. + * + * This routine blocks until a valid task is picked up, or the + * specified queue is empty. + */ +__device__ int runner_cuda_gettask ( struct queue_cuda *q, int wait ) { + int tid = -1; + + + /* Main loop. */ + while ( ( tid = cuda_queue_gettask( q , wait) ) >= 0 ) { + if( cuda_locktask(tid) == 1 ) + break; + cuda_queue_puttask ( q , tid ); + } + + /* Put this task into the recycling queue, if needed. */ + if ( tid >= 0 ) { + q->rec_data[ atomicAdd( (int *)&q->rec_count , 1 ) ] = tid; + } + + + /* Return whatever we got. */ + return tid; + } @@ -135,13 +353,13 @@ __device__ void cuda_queue_puttask ( struct queue_cuda *q , int tid ) { * This routine blocks until a valid task is picked up, or the * specified queue is empty. */ -__device__ int runner_cuda_gettask ( struct queue_cuda *q ) { +__device__ int runner_cuda_gettask ( struct queue_cuda *q, int wait ) { int tid = -1; /* Main loop. 
*/ - while ( ( tid = cuda_queue_gettask( q ) ) >= 0 ) { + while ( ( tid = cuda_queue_gettask( q , wait) ) >= 0 ) { break; } @@ -159,13 +377,387 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q ) { #endif +__global__ void qsched_device_kernel ( ) +{ + volatile __shared__ int tid; + int *src, *dest; + int i; + + + /* Pull a task from the queues*/ + while ( 1 ) { + __syncthreads(); + if ( threadIdx.x == 0 ) { + tid = -1; + if( cuda_queues[0].count == cuda_queues[0].rec_count ) + tid = runner_cuda_gettask ( &cuda_queues[1], 1); + else + tid = runner_cuda_gettask ( &cuda_queues[1], 0); + + if( tid < 0 ) + tid = runner_cuda_gettask ( &cuda_queues[0], 1); + } + + /*Everyone wait for us to get a task id*/ + __syncthreads(); + + /* Exit if we didn't get a valid task. */ + if(tid < 0) + break; + + + /*Start the task clock*/ + if( threadIdx.x == 0 ) + tasks_cuda[tid].tic = clock(); + + if( tasks_cuda[tid].type == type_load ) + { + int *d = (int*)&data_cuda[tasks_cuda[tid].data]; + src = (int*)res_cuda[d[0]].data; + dest = (int*)res_cuda[d[0]].gpu_data; + cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, tid); + }else if( tasks_cuda[tid].type == type_unload ) + { + int *d = (int*)&data_cuda[tasks_cuda[tid].data]; + src = (int*)res_cuda[d[0]].gpu_data; + dest = (int*)res_cuda[d[0]].data; + cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, d[0]); + }else{ + fun(tasks_cuda[tid].type , &data_cuda[tasks_cuda[tid].data]); + } + __syncthreads(); + /*Stop the task clock*/ + if( threadIdx.x == 0 ) + tasks_cuda[tid].toc = clock(); + + /*Unlocks*/ + #ifdef GPU_locks + if(threadIdx.x == 0) + cuda_done( &tasks_cuda[tid] ); + // __syncthreads(); + #endif + for(i = threadIdx.x; i < tasks_cuda[tid].nr_unlocks; i += blockDim.x ) + { + if( atomicSub( &tasks_cuda[tasks_cuda[tid].unlocks[i]].wait , 1 ) == 1 && !( tasks_cuda[tasks_cuda[tid].unlocks[i]].flags & task_flag_skip )) + { + cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] ); + } + } + } + + +/* Make a 
notch on the barrier, last one out cleans up the mess... */ + __syncthreads(); + if ( threadIdx.x == 0 ) + tid = ( atomicAdd( &cuda_barrier , 1 ) == gridDim.x-1 ); + __syncthreads(); + if ( tid ) { + if ( threadIdx.x == 0 ) { + cuda_barrier = 0; + volatile int *temp = cuda_queues[1].data; cuda_queues[1].data = cuda_queues[1].rec_data; cuda_queues[1].rec_data = temp; + + cuda_queues[0].first = 0; + cuda_queues[0].last = 0; + cuda_queues[0].rec_count = 0; + cuda_queues[1].first = 0; + cuda_queues[1].last = 0; + cuda_queues[1].rec_count = 0; + + } + +//TODO + /* for ( int j = threadIdx.x ; j < cuda_nr_tasks ; j+= blockDim.x ) + for ( k = 0 ; k < tasks_cuda[j].nr_unlock ; k++ ) + { + atomicAdd( (int *) &cuda_tasks[ cuda_tasks[j].unlock[k] ].wait , 1 ); + }*/ + + + } + +} +int maxVal( int *array, int size ) +{ + int i, maxi=-32000; + for (i=0; i<size; i++) + { + if (array[i]>maxi) + { + maxi=array[i]; + } + } + return maxi ; +} +int minVal( int *array, int size ) +{ + int i, maxi=3200000; + for (i=0; i<size; i++) + { + if (array[i]<maxi) + { + maxi=array[i]; + } + } + return maxi ; +} +void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, int parent, int *res, int *sorted ) +{ + int i,j; + if(numChildren > 0 && size/numChildren > 128*sizeof(int)) + { + /* Create dummy task for this resource and recurse to its children!*/ + int task, utask; + task = qsched_addtask( s, type_ghost, task_flag_none, NULL, 0 , 0 ); + s->res[ID].task = task; + utask = qsched_addtask( s , type_ghost, task_flag_none, NULL, 0 , 0 ); + s->res[ID].task = utask; + if(parent >= 0) + { + /* Create dependecy to parent. 
*/ + qsched_addunlock(s, task, s->res[parent].task ); + qsched_addunlock(s, s->res[parent].utask, utask ); + } + for(i = sorted[ID]; i < sorted[ID+1]; i++) + { + qsched_create_loads(s, i, s->res[res[i]].size, sorted[i+1]-sorted[i], ID, res, sorted); + } + }else{ + int task,utask; + task = qsched_addtask( s , type_load , task_flag_none, &ID, sizeof(int), 0 ); + s->res[ID].task = task; + utask = qsched_addtask( s , type_unload, task_flag_none, &ID, sizeof(int), 0 ); + s->res[ID].task = utask; + /* Create load task for this resource and set its children to completed with this task.*/ + for( j = sorted[ID]; j < sorted[ID+1]; j++ ) + { + s->res[res[j]].task = task; + } + /* If it has a parent then set the parents ghost task to be dependent on this.*/ + if(parent >= 0) + { + qsched_addunlock(s, task, s->res[parent].task ); + qsched_addunlock(s, s->res[parent].utask, utask); + } + } +} void qsched_prepare_loads ( struct qsched *s ) { -int i, task, unload, tb, j, k, unlocked=0; +int i, task, unload, j , k , unlocked = 0; +struct task *t; +int *sorted, lastindex; +int *res, *res_data; + +if(s->res[0].task != -1) +{ + printf("Tasks already initialised, not redoing load/unload tasks"); + return; +} + +/* Store number of children for each resource*/ +sorted = (int*) malloc(sizeof(int) * (s->count_res+1)); +res = (int*) malloc(sizeof(int) * s->count_res); +res_data = (int*) malloc(sizeof(int)* s->count_res); +memset( sorted, 0, sizeof(int) * (s->count_res+1)); +/* Count the children for each parent. Stored in sorted[parent +1]*/ +for(i = 0; i < s->count_res; i++) +{ + if(s->res[i].parent != qsched_res_none) + { + sorted[s->res[i].parent+1]++; + } +} + +/* Run through the sorted array and turn count to indices. 
Id 0 starts at 0, Id 1 starts at 0+sorted[1] (number of children for resource 0)*/ +sorted[0] = 0; +for(i = 1; i < s->count_res+1; i++ ) +{ + sorted[i] = sorted[i] + sorted[i-1]; +} +lastindex = sorted[s->count_res]; + +for(i = 0; i < s->count_res; i++) +{ + int parent = s->res[i].parent; + if( parent == qsched_res_none ) + { + res[lastindex] = i; + lastindex++; + }else{ + res[sorted[parent]] = i; + sorted[parent]++; + } +} + +/*Initialise the memory address array, already sorted by parents.*/ +int mini=0; +for(i = 0; i < s->count_res; i++) +{ + res_data[i] = ((char *)s->res[res[i]].data) - (char*)s->res[0].data; + if(res_data[i] < mini) + mini = res_data[i]; +// printf("%i ", res_data[i]); +} +for(i = 0; i < s->count_res; i++) +{ + res_data[i] -= mini; +printf("%i ", res_data[i]); +} +printf("\n"); + +/* Sort the children of each parent by memory address. */ +qsched_sort(res, res_data, sorted[0], minVal(res_data, sorted[0]), maxVal(res_data, sorted[0])); +for(i = 1; i < s->count_res; i++) +{ + if(sorted[i] > sorted[i-1]){ + qsched_sort(&res[sorted[i-1]], &res_data[sorted[i-1]], sorted[i]-sorted[i-1], minVal(&res_data[sorted[i-1]], sorted[i]-sorted[i-1]), maxVal(&res_data[sorted[i-1]], sorted[i]-sorted[i-1])); + } +} + +/* Sort super resources by memory address.*/ +qsched_sort(&res[sorted[s->count_res-1]], &res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1], minVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1]), maxVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1])); + + +/*for(i = 0; i < s->count_res; i++) +{ + printf("%i ", res_data[i]); +} +printf("\n"); + + +for(i = 0; i < s->count_res; i++) +{ +printf("%i ", res[i]); +} +printf("\n"); + +for(i = 0; i < s->count_res; i++) +{ + printf("%i ", s->res[res[i]].parent); +} +printf("\n");*/ +/*res now contains an array of indices, first sorted by parent, then memory address of data. 
*/ +int size=0; +if(sorted[0] != 0) +{ + /* Check no overlapping resources.*/ + for(i = 0; i < sorted[0]-1; i++) + { + if(res_data[i] + s->res[res[i]].size > res_data[i+1]) + error("Overlapping resources are not allowed."); + } +} + +for(i = 1; i < s->count_res; i++) +{ + if(sorted[i] > sorted[i-1]) + { + for(j = sorted[i-1]; j < sorted[i]-1; j++) + { + if(res_data[j] + s->res[res[j]].size > res_data[j+1]) + error("Overlapping resources are not allowed."); + } + } +} + +/* Check super resources don't overlap.*/ +for( i = sorted[s->count_res-1]; i < s->count_res; i++ ) +{ + if(res_data[i] + s->res[res[i]].size > res_data[i+1]) + error("Overlapping resources are not allowed."); +} + +/* Reposition sorted pointers so that sorted[i] points to the first child of task ID= i*/ +for(i = sorted[s->count_res]; i >= 0; i-- ) +{ + sorted[i] = sorted[i-1]; +} +/* If nothing overlaps create tasks.*/ +for( i = sorted[s->count_res]; i < s->count_res; i++ ) +{ + /* Start from each parentless task.*/ + int ID = res[i]; + int size = s->res[ID].size; + int numChildren = sorted[ID+1] - sorted[ID]; + int parent = -1; + + qsched_create_loads(s, ID, size, numChildren, parent, res, sorted); + +} + +/* Check all tasks have load tasks - if not give parents (recursively)*/ +for(i = 0; i < s->count_res; i++ ) +{ + if( s->res[i].task == -1 ) + { + struct res *t; + for(t = &s->res[s->res[i].parent]; s->res[i].task == -1; t = &s->res[t->parent]) + { + s->res[i].task = t->task; + if((t->parent == qsched_res_none) && s->res[i].task == -1) + error("Somehow load task wasn't initialised"); + } + } + + if( s->res[i].utask == -1 ) + { + struct res *t; + for(t = &s->res[s->res[i].parent]; s->res[i].utask == -1; t = &s->res[t->parent]) + { + s->res[i].utask = t->utask; + if((t->parent == qsched_res_none) && s->res[i].utask == -1) + error("Somehow unload task wasn't initialised"); + } + } +} + + + +/* Set up dependencies with the rest of the system.*/ +for( i = 0; i < s->count_res; i++ ) +{ + int unlocked 
= 0; + for( j = 0; j < s->count; j++ ) + { + struct task *t = &s->tasks[j]; + for(k = 0; k < t->nr_uses; k++) + { + if(t->uses[k] == i) + { + qsched_addunlock(s, s->res[i].task ,j); + qsched_addunlock(s, j, s->res[i].utask); + unlocked = 1; + break; + } + } + if(unlocked == 1) + { + unlocked = 0; + continue; + } + for(k = 0; k < t->nr_locks; k++) + { + if(t->locks[k] == i) + { + qsched_addunlock(s, s->res[i].task ,j); + qsched_addunlock(s, j, s->res[i].utask); + break; + } + } + } +} +//error("Got to here"); +} + + + + +void qsched_prepare_loads_old ( struct qsched *s ) { + +int i, task, unload, j, k, unlocked=0; struct task *t; /* Create a load task for each resource. */ for(i = 0; i < s->count_res; i++) @@ -173,17 +765,19 @@ for(i = 0; i < s->count_res; i++) if(s->res[i].size == 0) continue; - - - - - task = qsched_addtask( s , type_load , 0 , &i , sizeof(int) , 0 ); - unload = qsched_addtask( s , type_unload, 0 , &i, sizeof(int), 0 ); + if(s->res[i].task >= 0) + continue; +// cudaMalloc( &s->res[ i ].gpu_data, s->res[i].size ); + task = qsched_addtask( s , type_load , task_flag_none , &i , sizeof(int) , 0 ); + s->res[i].task = task; + printf("s->res[i].task = %i\n", s->res[i].task); + unload = qsched_addtask( s , type_unload, task_flag_none , &i, sizeof(int), 0 ); /*Load task unlocks each task that uses or locks the specified resource */ + /*Unload task is unlocked by each task that is unlocked by the load task. 
*/ for(j = 0; j < s->count; j++) { t = &s->tasks[j]; - + for(k = 0; k < t->nr_uses; k++) { if(t->uses[k] == i){ @@ -208,8 +802,8 @@ for(i = 0; i < s->count_res; i++) } } - - + // qsched_adduse(s, task , i); + // qsched_adduse(s, unload , i); } @@ -221,97 +815,405 @@ for(i = 0; i < s->count_res; i++) -int qsched_prepare_cuda ( struct qsched *s ) { + +extern "C" void qsched_prepare_cuda ( struct qsched *s ) { int i; -struct task *cuda_t; -qsched_res_t *setup; +int j, k, count; +struct task *t, *tasks; +struct task *cuda_t, *task, *temp; +qsched_res_t *setup_l, *setup_u; qsched_task_t *setup_t; struct res *res_t; int *data; +char *sdata; -/*Setup Load and unload tasks*/ +/* Lock the sched. */ + //lock_lock( &s->lock ); + /* Get a pointer to the tasks, set the count. */ + tasks = s->tasks; + count = s->count; + + /* If the sched is dirty... */ + if ( s->flags & qsched_flag_dirty ) { + + /* Do the sorts in parallel, if possible. */ + #pragma omp parallel + { + + /* Sort the unlocks. */ + #pragma omp single nowait + qsched_sort( s->deps , s->deps_key , s->count_deps , 0 , count - 1 ); + + /* Sort the locks. */ + #pragma omp single nowait + qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 ); + + /* Sort the uses. */ + #pragma omp single nowait + qsched_sort( s->uses , s->uses_key , s->count_uses , 0 , count - 1 ); + + } + + /* Run throught the tasks and link the locks and unlocks. */ + tasks[0].unlocks = s->deps; + tasks[0].locks = s->locks; + tasks[0].uses = s->uses; + for ( k = 1 ; k < count ; k++ ) { + tasks[k].unlocks = &tasks[k-1].unlocks[ tasks[k-1].nr_unlocks ]; + tasks[k].locks = &tasks[k-1].locks[ tasks[k-1].nr_locks ]; + tasks[k].uses = &tasks[k-1].uses[ tasks[k-1].nr_uses ]; + } + + /* All cleaned-up now! */ + //s->flags &= ~qsched_flag_dirty; + + } +qsched_prepare_loads(s); + + + + /* Get a pointer to the tasks, set the count. */ + tasks = s->tasks; + count = s->count; + + /* If the sched is dirty... 
*/ + if ( s->flags & qsched_flag_dirty ) { + + /* Do the sorts in parallel, if possible. */ + #pragma omp parallel + { + + /* Sort the unlocks. */ + #pragma omp single nowait + qsched_sort( s->deps , s->deps_key , s->count_deps , 0 , count - 1 ); + + /* Sort the locks. */ + #pragma omp single nowait + qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 ); + + /* Sort the uses. */ + #pragma omp single nowait + qsched_sort( s->uses , s->uses_key , s->count_uses , 0 , count - 1 ); + + } + + /* Run throught the tasks and link the locks and unlocks. */ + tasks[0].unlocks = s->deps; + tasks[0].locks = s->locks; + tasks[0].uses = s->uses; + for ( k = 1 ; k < count ; k++ ) { + tasks[k].unlocks = &tasks[k-1].unlocks[ tasks[k-1].nr_unlocks ]; + tasks[k].locks = &tasks[k-1].locks[ tasks[k-1].nr_locks ]; + tasks[k].uses = &tasks[k-1].uses[ tasks[k-1].nr_uses ]; + } + + /* All cleaned-up now! */ + s->flags &= ~qsched_flag_dirty; + + } + + /* Init the queues. */ + for ( k = 0 ; k < s->nr_queues ; k++ ) + queue_init( &s->queues[k] , count ); + /* Reset the waits to 0... */ + for( k = 0; k < count; k++ ) + { + tasks[k].wait = 0; + } + + /* Run through the tasks and set the waits... */ + for ( k = 0 ; k < count ; k++ ) { + t = &tasks[k]; + if ( !( t->flags & task_flag_skip ) ) + for ( j = 0 ; j < t->nr_unlocks ; j++ ) + tasks[ t->unlocks[j] ].wait += 1; + } + + /* Sort the tasks topologically. */ + int *tid = (int *)malloc( sizeof(int) * count ); + for ( j = 0 , k = 0 ; k < count ; k++ ) + if ( tasks[k].wait == 0 ) { + tid[j] = k; + j += 1; + } + for ( k = 0 ; k < j ; k++ ) { + t = &tasks[ tid[k] ]; + for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ ) + if ( ( tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) { + tid[j] = t->unlocks[kk]; + j += 1; + } + } + if ( k < count ) + error( "Circular dependencies detected." ); + + /* Run through the topologically sorted tasks backwards and + set their weights, re-setting the waits while we're at it. 
*/ + for ( k = count-1 ; k >= 0 ; k-- ) { + int maxweight = 0; + t = &tasks[ tid[k] ]; + for ( j = 0 ; j < t->nr_unlocks ; j++ ) { + tasks[ t->unlocks[j] ].wait += 1; + if ( tasks[ t->unlocks[j] ].weight > maxweight ) + maxweight = tasks[ t->unlocks[j] ].weight; + } + t->weight = t->cost + maxweight; + } +/*Allocate temporary tasks to setup device tasks*/ +temp = (struct task *) malloc(s->count * sizeof(struct task)); +if(temp == NULL) + error("Failed to allocate the temporary task store."); +memcpy(temp, s->tasks, s->count * sizeof(struct task)); /*Copy the qsched data to the device*/ -if( cudaMalloc( &cuda_t , sizeof(struct task) * s->count ) != cudaSuccess ) - error("Failed to allocate task array on the device."); +if(cudaMalloc( &sdata , s->count_data ) != cudaSuccess ) + error("Failed to allocate the qsched data on the device"); +if(cudaMemcpy( sdata , s->data , s->count_data , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy the qsched data to the device."); +if(cudaMemcpyToSymbol( data_cuda , &sdata , sizeof(char *) , 0 ,cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy data pointer to the device."); -if( cudaMemcpy( cuda_t, s->tasks, sizeof(struct task) * s->count , cudaMemcpyHostToDevice ) != cudaSuccess ) - error("Failed to copy tasks to the device."); -if( cudaMemcpyToSymbol ( cuda_t , tasks_cuda, sizeof(struct task *) , cudaMemcpyHostToDevice ) != cudaSuccess ) - error("Failed to copy task pointer to the device."); -if( cudaMalloc( &setup , sizeof(qsched_res_t) * s->count_locks ) != cudaSuccess ) +/*Copy the task data to the device*/ + +if(s->count_locks > 0 ) +{ +if( cudaMalloc( &setup_l , sizeof(qsched_res_t) * s->count_locks ) != cudaSuccess ) error("Failed to allocate locks array on the device."); -if( cudaMemcpy( &setup , s->locks , sizeof(qsched_res_t) * s->count_locks, cudaMemcpyHostToDevice ) != cudaSuccess ) +if( cudaMemcpy( setup_l , s->locks , sizeof(qsched_res_t) * s->count_locks, cudaMemcpyHostToDevice ) != 
cudaSuccess ) error("Failed to copy locks to the device."); -if( cudaMemcpyToSymbol ( setup , locks_cuda, sizeof(qsched_res_t *), cudaMemcpyHostToDevice ) != cudaSuccess ) +if( cudaMemcpyToSymbol ( locks_cuda, &setup_l , sizeof(qsched_res_t *), 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy locks pointer to the device."); +for(i = 0; i < s->count; i++ ) +{ + task = &temp[i]; + task->locks = setup_l + (task->locks - s->locks); +} +} if( cudaMalloc( &setup_t , sizeof(qsched_task_t) * s->count_deps ) != cudaSuccess ) - error("Failed to allocate deps array on the device."); -if( cudaMemcpy( &setup_t , s->deps , sizeof(qsched_task_t) * s->count_deps, cudaMemcpyHostToDevice ) != cudaSuccess ) - error("Failed to copy deps to the device."); -if( cudaMemcpyToSymbol ( &setup_t , deps_cuda, sizeof(qsched_task_t *) , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to allocate deps array on the device: %s", cudaGetErrorString(cudaPeekAtLastError())); +if( cudaMemcpy( setup_t , s->deps , sizeof(qsched_task_t) * s->count_deps, cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy deps to the device: %s", cudaGetErrorString(cudaPeekAtLastError())); +if( cudaMemcpyToSymbol ( deps_cuda, &setup_t , sizeof(qsched_task_t *) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy deps pointer to the device."); -if( cudaMalloc( &uses_cuda , sizeof(qsched_res_t) * s->count_uses ) != cudaSuccess ) + + +if( cudaMalloc( &setup_u , sizeof(qsched_res_t) * s->count_uses ) != cudaSuccess ) error("Failed to allocate use array on the device."); -if( cudaMemcpy( &setup , s->uses , sizeof(qsched_res_t) * s->count_uses, cudaMemcpyHostToDevice ) != cudaSuccess ) +if( cudaMemcpy( setup_u , s->uses , sizeof(qsched_res_t) * s->count_uses, cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy locks to the device."); -if( cudaMemcpyToSymbol ( setup , uses_cuda, sizeof(qsched_res_t *), cudaMemcpyHostToDevice ) != cudaSuccess ) +if( 
cudaMemcpyToSymbol ( uses_cuda, &setup_u , sizeof(qsched_res_t *), 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy locks pointer to the device."); + if( cudaMalloc( &res_t , sizeof(struct res) * s->count_res ) != cudaSuccess ) error("Failed to allocated on the device."); for(i = 0; i < s->count_res; i++) { - if(s->res[i].size == 0) - continue; - if( cudaMalloc( &data, sizeof(int) * s->res[i].size) != cudaSuccess ) - error("Failed to allocate data space on the device."); - s->res[i].gpu_data = data; + // if(s->res[i].size == 0) + // continue; + //if( cudaMalloc( &data, s->res[i].size) != cudaSuccess ) + // error("Failed to allocate data space on the device."); + //s->res[i].gpu_data = data; } -if( cudaMemcpy( &res_t , s->res , sizeof(struct res) * s->count_res , cudaMemcpyHostToDevice) != cudaSuccess ) - error("Failed to copy resources to the device."); -if( cudaMemcpyToSymbol( res_t , res_cuda , sizeof(struct res *) * s->count_res , cudaMemcpyHostToDevice) != cudaSuccess ) - error("Failed to copy res pointer to the device."); - - +if( cudaMemcpy( res_t , s->res , sizeof(struct res) * s->count_res , cudaMemcpyHostToDevice) != cudaSuccess ) + error("Failed to copy resources to the device: %s", cudaGetErrorString(cudaPeekAtLastError())); +if( cudaMemcpyToSymbol( res_cuda , &res_t , sizeof(struct res *) , 0 , cudaMemcpyHostToDevice) != cudaSuccess ) + error("Failed to copy res pointer to the device: %s", cudaGetErrorString(cudaPeekAtLastError())); +for(i = 0; i < s->count; i++ ) +{ + task = &temp[i]; + task->unlocks = setup_t + (task->unlocks - s->deps); +} +for(i = 0; i < s->count; i++ ) +{ + task = &temp[i]; + task->uses = setup_u + (task->uses - s->uses); +} +if( cudaMalloc( &cuda_t , sizeof(struct task) * s->count ) != cudaSuccess ) + error("Failed to allocate task array on the device."); +if( cudaMemcpy( cuda_t, temp, sizeof(struct task) * s->count , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy tasks to the device."); +if( 
cudaMemcpyToSymbol ( tasks_cuda, &cuda_t , sizeof(struct task *) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy task pointer to the device."); +/*for(i = 0; i < s->count; i++) +{ + printf("Task:%i of type %i has wait %i\n", i, temp[i].type, temp[i].wait); +}*/ /* Initialize the queues. */ -int nr_queues= 2, k, qsize; +int nr_queues= 2,qsize; //int *data; struct queue_cuda queues[ cuda_numqueues ]; + qsize = max(2*s->count / nr_queues, 256); + if ( cudaMemcpyToSymbol( cuda_queue_size , &qsize , sizeof(int) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy queue size to the device."); + + /* Allocate a temporary buffer for the queue data. */ + if ( ( data = (int *)malloc( sizeof(int) * qsize ) ) == NULL ) + error("Failed to allocate data buffer."); + + queues[1].count = 0; + for(i = 0; i < s->count; i++) + { + if(temp[i].type == type_load && temp[i].wait == 0) + data[queues[1].count++] = i; + + } + for ( k = queues[1].count ; k < qsize ; k++ ) + data[k] = -1; + /* Allocate and copy the data. */ + if ( cudaMalloc( &queues[1].data , sizeof(int) * qsize ) != cudaSuccess ) + error("Failed to allocate queue data on the device."); + if ( cudaMemcpy( (void *)queues[1].data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy queue data pointer to the device"); + + for ( k = 0; k < qsize; k++ ) + data[k] = -1; + + /* Allocate and copy the recyling data. */ + if ( cudaMalloc( &queues[1].rec_data , sizeof(int) * qsize ) != cudaSuccess ) + error("Failed to allocate queue data on the device."); + if ( cudaMemcpy( (void *)queues[1].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy queue data pointer to the device"); + /* Allocate and copy the recyling data. 
*/ + if ( cudaMalloc( &queues[0].rec_data , sizeof(int) * qsize ) != cudaSuccess ) + error("Failed to allocate queue data on the device."); + if ( cudaMemcpy( (void *)queues[0].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy queue data pointer to the device"); + + /* Set some other values. */ + queues[1].first = 0; + queues[1].last = queues[1].count; + queues[1].rec_count = 0; + + /* Init queue 0*/ + queues[0].count = 0; + for ( k = 0; k < s->count ; k++ ) + { + if(temp[k].type != type_load && temp[k].wait == 0){ + data[queues[0].count++] = k; + } + + } + queues[0].first = 0; + queues[0].last = queues[0].count; + queues[0].count = s->count - queues[1].count; + queues[0].rec_count = 0; + + + /* Allocate and copy the data. */ + if ( cudaMalloc( &queues[0].data , sizeof(int) * qsize ) != cudaSuccess ) + error("Failed to allocate queue data on the device."); + if ( cudaMemcpy( (void *)queues[0].data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy queue data pointer to the device"); + + /* Copy the queue structures to the device. */ + if ( cudaMemcpyToSymbol( cuda_queues , &queues , sizeof(struct queue_cuda) * nr_queues , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy the queues to the device"); + /* Clean up. */ + free( tid ); + printf("Queue[0].count = %i , queue[1].count = %i\n", queues[0].count, queues[1].count); + + /* Set the number of waiting tasks. */ + s->waiting = count; + + /* Set the ready flag. */ + s->flags |= qsched_flag_ready; + + free( temp ); + free( data ); + /* Unlock the sched. 
*/ + //lock_unlock_blind( &s->lock ); + +} +struct task* qsched_get_timers( struct qsched *s, int numtasks ) +{ + struct task *gpu_tasks = NULL, *cuda_tasks; + + cuda_tasks = (struct task*)malloc(numtasks * sizeof(struct task)); + if(cuda_tasks == NULL) + error("Failed to allocate cuda_tasks"); + + if( cudaMemcpyFromSymbol(&gpu_tasks, tasks_cuda, sizeof(struct task*), 0 ) != cudaSuccess ) + error("Failed to get symbol from device"); + + if( cudaMemcpy( cuda_tasks, gpu_tasks, sizeof(struct task) * numtasks, cudaMemcpyDeviceToHost) != cudaSuccess ) + error("Failed to copy tasks from device to host"); + return cuda_tasks; } +/*void qsched_get_CUDA_tasks ( struct qsched *s , struct task *cuda_tasks , int numtasks ) +{ + +}*/ + +/** + * @brief Execute all the tasks in the current scheduler using + * OpenMP. + * + * @param s Pointer to the #qsched. + * @param fun User-supplied function that will be called with the + * task type and a pointer to the task data. This must be a __device__ function! + * + * This function is only available if QuickSched was compiled with + * OpenMP support. + */ +void qsched_run_CUDA ( struct qsched *s, qsched_funtype func) { + +#ifdef WITH_CUDA + double itpms = 1000.0 / CPU_TPS; + ticks tic, toc_run ; + tic = getticks(); + qsched_prepare_cuda( s ); + toc_run = getticks(); + message( "prepare_cuda took %.3f ms" , ((double)(toc_run - tic)) * itpms ); + cudaMemcpyToSymbol( fun , &func , sizeof(qsched_funtype)); + tic = getticks(); + qsched_device_kernel<<<1, 128 >>> ( ); + if( cudaDeviceSynchronize() != cudaSuccess ) + error("Failed to execute kernel:%s", cudaGetErrorString(cudaPeekAtLastError())); + toc_run = getticks(); + message( "run_CUDA took %.3f ms" , ((double)(toc_run - tic)) * itpms ); + + // TODO Unload timers (optional). 
+ +#else + error("QuickSched was not compiled with CUDA support."); + +#endif + +} diff --git a/src/cuda_queue.h b/src/cuda_queue.h index a546786ba406fc17ef584e50c4fe44ae19f8737a..27f41c5f32674b8bdbc1c6249b44ccb60b39e161 100644 --- a/src/cuda_queue.h +++ b/src/cuda_queue.h @@ -16,9 +16,10 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. * ******************************************************************************/ -#define cuda_max_unlock 128 #define cuda_numqueues 2 +/** Type definition for the execution function in #qsched_run. */ +typedef void (*qsched_funtype)( int , void * ); /** Struct for a task queue. */ struct queue_cuda { @@ -39,3 +40,8 @@ struct queue_cuda { volatile int *rec_data; }; + +/*Define function prototypes*/ +void qsched_run_CUDA ( struct qsched *s, qsched_funtype fun ); +extern "C" void qsched_prepare_cuda ( struct qsched *s ); +struct task* qsched_get_timers( struct qsched *s, int numtasks ); diff --git a/src/lock.h b/src/lock.h index f814a09e378b570fc606ff24535cc21adb60a5c3..5c14abfe76e7bac02f6e15512e7276b01ccd5fe3 100644 --- a/src/lock.h +++ b/src/lock.h @@ -27,28 +27,11 @@ #endif #ifdef WITH_CUDA -#undef INLINE +#define static #endif - -#ifdef WITH_CUDA - #define lock_type volatile int - #define lock_init( l ) ( *l = 0 ) - #define lock_destroy( l ) 0 - __device__ inline static int lock_lock ( volatile int *l ) { - while( atomicCAS( (int *)l , 0 , 1 ) != 0 ); - return 0; - } - __device__ inline static int lock_trylock ( volatile int *l ) { - int res = atomicCAS( (int *)l , 0 , 1 ); - return res; - } - __device__ inline static int lock_unlock ( volatile int *l ) { - int res = atomicCAS ( (int *)l , 1 , 0 ) != 1; - return res; - } - #define lock_unlock_blind( l ) atomicCAS( &l , 1 , 0 ) -#elif defined (PTHREAD_LOCK) +#ifdef PTHREAD_LOCK +#ifndef WITH_CUDA #define lock_type pthread_spinlock_t #define lock_init( l ) ( pthread_spin_init( l , PTHREAD_PROCESS_PRIVATE ) != 0 ) #define lock_destroy( l ) ( 
pthread_spin_destroy( l ) != 0 ) @@ -69,3 +52,20 @@ #define lock_unlock( l ) ( __sync_val_compare_and_swap( l , 1 , 0 ) != 1 ) #define lock_unlock_blind( l ) __sync_val_compare_and_swap( l , 1 , 0 ) #endif +#else + #define lock_type volatile int + #define lock_init( l ) ( *l = 0 ) + #define lock_destroy( l ) 0 + INLINE static int lock_lock ( volatile int *l ) { + while ( __sync_val_compare_and_swap( l , 0 , 1 ) != 0 ); + // while( *l ); + return 0; + } + #define lock_trylock( l ) ( ( *(l) ) ? 1 : __sync_val_compare_and_swap( l , 0 , 1 ) ) + #define lock_unlock( l ) ( __sync_val_compare_and_swap( l , 1 , 0 ) != 1 ) + #define lock_unlock_blind( l ) __sync_val_compare_and_swap( l , 1 , 0 ) +#endif + +#ifdef WITH_CUDA +#undef static +#endif diff --git a/src/qsched.c b/src/qsched.c index 5bcf1487d8b321ae308f5aa46254e8fdbb9902b6..cbc148424029ca39014c752194fc8a41d766d1dc 100644 --- a/src/qsched.c +++ b/src/qsched.c @@ -276,6 +276,8 @@ void qsched_reset ( struct qsched *s ) { } + + /** * @brief Execute all the tasks in the current scheduler using * OpenMP. @@ -698,7 +700,7 @@ void qsched_unlocktask ( struct qsched *s , int tid ) { * will require the #qsched to be re-prepared. */ -struct task *qsched_gettask ( struct qsched *s , int qid ) { +struct task* qsched_gettask ( struct qsched *s , int qid ) { int naq, k, tid, qids[ s->nr_queues ]; struct task *t; @@ -785,6 +787,70 @@ struct task *qsched_gettask ( struct qsched *s , int qid ) { } +/** + * @brief Sort the data according to the given indices. + * + * @param data The data to be sorted + * @param ind The indices with respect to which the data are sorted. + * @param N The number of entries + * @param min Lowest index. + * @param max highest index. + * + * This function calls qsched_quicksort. 
+ */ + +void qsched_sort ( int *restrict data, int *restrict ind, int N, int min, int max ) { + int *new_data; + int *new_ind; + int i; + if(N <= 0) + return; + new_data = (int*)malloc(sizeof(int) * N); + if(new_data == NULL) + error("Failed to allocate new_data"); + new_ind = (int*)malloc(sizeof(int) * N); + if(new_ind == NULL) + error("Failed to allocate new_ind"); + + /*Create buckets of size ? - Ideally <16 elements per bucket. Use max-min / N * 10 ? Should give average of 10 elements per bucket */ + int bucketsize = 1; + + /* To find bucket do ind-min / b and it goes in that bucket.*/ + int num_buckets = (max-min) / bucketsize +1; + int *bucket_inds = (int*) malloc(sizeof(int) * num_buckets); + if(bucket_inds == NULL) + error("Failed to allocate bucket_inds"); + memset(bucket_inds,0, sizeof(int)*num_buckets); + for(i = 0; i < N; i++) + { + bucket_inds[(ind[i]-min)]++; + } + for(i = 1; i < num_buckets; i++ ) + { + bucket_inds[i] = bucket_inds[i] + bucket_inds[i-1]; + } + /* bucket_inds[i] contains the starting position for the i+1th bucket*/ + for(i = num_buckets-1; i >0; i--) + { + bucket_inds[i] = bucket_inds[i-1]; + } + bucket_inds[0] = 0; + + for(i = 0; i < N; i++) + { + int z = (ind[i]-min); + new_data[bucket_inds[z]] = data[i]; + new_ind[bucket_inds[z]++] = ind[i]; + } + + /* Copy data back to data and ind and deallocate everything!*/ + memcpy(data, new_data, N*sizeof(int)); + memcpy(ind, new_ind, N*sizeof(int)); + free(new_data); + free(new_ind); + free(bucket_inds); + +} /** * @brief Sort the data according to the given indices. @@ -798,7 +864,7 @@ struct task *qsched_gettask ( struct qsched *s , int qid ) { * This function calls itself recursively. 
*/ -void qsched_sort ( int *restrict data , int *restrict ind , int N , int min , int max ) { +void qsched_quicksort ( int *restrict data , int *restrict ind , int N , int min , int max ) { int pivot = (min + max) / 2; int i = 0, j = N-1; @@ -806,7 +872,7 @@ void qsched_sort ( int *restrict data , int *restrict ind , int N , int min , in /* If N is small enough, just do insert sort. */ if ( N < 16 ) { - + printf("%i\n", N); for ( i = 1 ; i < N ; i++ ) if ( ind[i] < ind[i-1] ) { temp_i = ind[i]; @@ -842,13 +908,13 @@ void qsched_sort ( int *restrict data , int *restrict ind , int N , int min , in /* Recurse on the left? */ if ( j > 0 && pivot > min ) { #pragma omp task untied - qsched_sort( data , ind , j+1 , min , pivot ); + qsched_quicksort( data , ind , j+1 , min , pivot ); } /* Recurse on the right? */ if ( i < N && pivot+1 < max ) { #pragma omp task untied - qsched_sort( &data[i], &ind[i], N-i , pivot+1 , max ); + qsched_quicksort( &data[i], &ind[i], N-i , pivot+1 , max ); } } @@ -856,11 +922,11 @@ void qsched_sort ( int *restrict data , int *restrict ind , int N , int min , in /* Recurse on the left? */ if ( j > 0 && pivot > min ) - qsched_sort( data , ind , j+1 , min , pivot ); + qsched_quicksort( data , ind , j+1 , min , pivot ); /* Recurse on the right? */ if ( i < N && pivot+1 < max ) - qsched_sort( &data[i], &ind[i], N-i , pivot+1 , max ); + qsched_quicksort( &data[i], &ind[i], N-i , pivot+1 , max ); } @@ -885,9 +951,6 @@ void qsched_prepare ( struct qsched *s ) { /* Lock the sched. */ lock_lock( &s->lock ); - #ifdef WITH_CUDA - qsched_prepare_loads( s ); - #endif /* Get a pointer to the tasks, set the count. */ tasks = s->tasks; @@ -1005,9 +1068,11 @@ void qsched_prepare ( struct qsched *s ) { * @param size Size of the data in bytes. * @return The ID of the new shared resource. 
*/ - +#ifdef WITH_CUDA +int qsched_addres ( struct qsched *s , int owner , int parent, void* data, int size , void* gpu_data) { +#else int qsched_addres ( struct qsched *s , int owner , int parent, void* data, int size ) { - +#endif struct res *res_new; int id; @@ -1047,10 +1112,26 @@ int qsched_addres ( struct qsched *s , int owner , int parent, void* data, int s s->res[ id ].data = data; s->res[ id ].size = size; + /* Resources data must be contained by parent*/ + if( s->res[ id ].parent != qsched_res_none && data != NULL) + { + char* parentdata = (char*)s->res[ parent ].data; + char* childdata = (char*) data; + int pos = childdata-parentdata; + if(pos < 0 || pos + size > s->res[parent].size) + { + error("Data for a child resource must be contained in the data of its parent"); + } + } + #ifdef WITH_CUDA - cudaMalloc( &s->res[ id ].cuda_data, size ); + s->res[ id ].gpu_data = gpu_data; + s->res[ id ].task = -1; + s->res[ id ].utask = -1; #endif + + /* Unlock the sched. */ lock_unlock_blind( &s->lock ); diff --git a/src/qsched.h b/src/qsched.h index a2d18d3818d074a5e74c9c640a85e0503f6fee7e..dfa9cde5f36d53704ee07f0419f082151bb79d43 100644 --- a/src/qsched.h +++ b/src/qsched.h @@ -177,7 +177,11 @@ void qsched_enqueue ( struct qsched *s , struct task *t ); /* External functions. 
*/ void qsched_init ( struct qsched *s , int nr_queues , int flags ); +#ifdef WITH_CUDA +qsched_res_t qsched_addres ( struct qsched *s , int owner , qsched_res_t parent , void* data, int size, void* gpu_data); +#else qsched_res_t qsched_addres ( struct qsched *s , int owner , qsched_res_t parent , void* data, int size); +#endif void qsched_addlock ( struct qsched *s , qsched_task_t t , qsched_res_t res ); void qsched_addunlock ( struct qsched *s , qsched_task_t ta , qsched_task_t tb ); qsched_task_t qsched_addtask ( struct qsched *s , int type , unsigned int flags , void *data , int data_size , int cost ); diff --git a/src/res.h b/src/res.h index bb2b3043eb974d09193a6dd06b153b111bfe198c..8418b6fa1d89118ae2e079d57d02887087d98adb 100644 --- a/src/res.h +++ b/src/res.h @@ -16,7 +16,9 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. * ******************************************************************************/ - +#ifdef WITH_CUDA +#define res_no_task -2 +#endif /* The res data structure. */ struct res { @@ -36,12 +38,17 @@ struct res { /* Pointer to data on the CPU */ void* data; - /*Size of the data associated with this resource */ + /*Size of the data (in bytes) associated with this resource */ int size; #ifdef WITH_CUDA /* Pointer to the data on the GPU */ void* gpu_data; + + /* Index of the load task for this resource, if required.*/ + int task; + /* Index of the unload task for this resouce, if required.*/ + int utask; #endif };