diff --git a/autom4te.cache/requests b/autom4te.cache/requests index 6cd5e4a3a44e4986e168bef0c16985538948cd38..5f6702525b223f35d76288fa655f62b87988382b 100644 --- a/autom4te.cache/requests +++ b/autom4te.cache/requests @@ -68,11 +68,11 @@ '_LT_AC_LANG_C_CONFIG' => 1, 'AM_PROG_INSTALL_STRIP' => 1, '_m4_warn' => 1, - 'AC_LIBTOOL_OBJDIR' => 1, 'AX_GCC_X86_CPUID' => 1, + 'AC_LIBTOOL_OBJDIR' => 1, 'gl_FUNC_ARGZ' => 1, - 'AM_SANITY_CHECK' => 1, 'LTOBSOLETE_VERSION' => 1, + 'AM_SANITY_CHECK' => 1, 'AC_LIBTOOL_LANG_GCJ_CONFIG' => 1, 'AC_LIBTOOL_PROG_COMPILER_PIC' => 1, 'LT_LIB_M' => 1, @@ -85,21 +85,21 @@ 'AC_LIBTOOL_GCJ' => 1, 'DX_CLEAR_DEPEND' => 1, '_LT_WITH_SYSROOT' => 1, - 'LT_FUNC_DLSYM_USCORE' => 1, 'LT_SYS_DLOPEN_DEPLIBS' => 1, - '_LT_AC_LANG_F77' => 1, + 'LT_FUNC_DLSYM_USCORE' => 1, 'AC_LIBTOOL_CONFIG' => 1, - '_AM_SUBST_NOTMAKE' => 1, + '_LT_AC_LANG_F77' => 1, 'AC_LTDL_DLLIB' => 1, + '_AM_SUBST_NOTMAKE' => 1, '_AM_AUTOCONF_VERSION' => 1, 'AM_DISABLE_SHARED' => 1, '_LT_PROG_ECHO_BACKSLASH' => 1, '_LTDL_SETUP' => 1, - '_LT_AC_LANG_CXX' => 1, 'AM_PROG_LIBTOOL' => 1, - 'AC_LIB_LTDL' => 1, - '_LT_AC_FILE_LTDLL_C' => 1, + '_LT_AC_LANG_CXX' => 1, 'AM_PROG_LD' => 1, + '_LT_AC_FILE_LTDLL_C' => 1, + 'AC_LIB_LTDL' => 1, 'DX_DOXYGEN_FEATURE' => 1, 'AU_DEFUN' => 1, 'AC_PROG_NM' => 1, @@ -113,58 +113,58 @@ '_AM_SET_OPTION' => 1, 'AC_LTDL_PREOPEN' => 1, '_LT_LINKER_BOILERPLATE' => 1, - 'AC_LIBTOOL_LANG_CXX_CONFIG' => 1, - 'AC_LIBTOOL_PROG_CC_C_O' => 1, '_LT_PREPARE_SED_QUOTE_VARS' => 1, + 'AC_LIBTOOL_PROG_CC_C_O' => 1, + 'AC_LIBTOOL_LANG_CXX_CONFIG' => 1, 'gl_PREREQ_ARGZ' => 1, 'AX_EXT' => 1, - 'AM_OUTPUT_DEPENDENCY_COMMANDS' => 1, 'LT_SUPPORTED_TAG' => 1, - 'LT_SYS_MODULE_EXT' => 1, + 'AM_OUTPUT_DEPENDENCY_COMMANDS' => 1, 'LT_PROG_RC' => 1, + 'LT_SYS_MODULE_EXT' => 1, 'AC_DEFUN_ONCE' => 1, 'AX_CHECK_COMPILE_FLAG' => 1, 'DX_XML_FEATURE' => 1, '_LT_AC_LANG_GCJ' => 1, 'AC_LTDL_OBJDIR' => 1, - '_LT_PATH_TOOL_PREFIX' => 1, 'DX_TEST_FEATURE' => 1, + '_LT_PATH_TOOL_PREFIX' => 1, 'AC_LIBTOOL_RC' => 1, - 'AM_SILENT_RULES' => 1, - 'AC_DISABLE_FAST_INSTALL' => 1, '_LT_AC_PROG_ECHO_BACKSLASH' => 1, + 'AC_DISABLE_FAST_INSTALL' => 1, + 'AM_SILENT_RULES' => 1, 'DX_CHECK_DEPEND' => 1, 'DX_FEATURE_pdf' => 1, - '_LT_AC_SYS_LIBPATH_AIX' => 1, - '_LT_AC_TRY_DLOPEN_SELF' => 1, - 'DX_REQUIRE_PROG' => 1, 'include' => 1, + 'DX_REQUIRE_PROG' => 1, + '_LT_AC_TRY_DLOPEN_SELF' => 1, + '_LT_AC_SYS_LIBPATH_AIX' => 1, 'LT_AC_PROG_SED' => 1, 'AM_ENABLE_SHARED' => 1, 'DX_FEATURE_html' => 1, 'LTDL_INSTALLABLE' => 1, - '_LT_AC_LANG_GCJ_CONFIG' => 1, 'DX_CURRENT_DESCRIPTION' => 1, + '_LT_AC_LANG_GCJ_CONFIG' => 1, 'AC_ENABLE_SHARED' => 1, - 'AC_ENABLE_STATIC' => 1, - 'AC_LIBTOOL_SYS_HARD_LINK_LOCKS' => 1, '_LT_REQUIRED_DARWIN_CHECKS' => 1, - '_LT_AC_TAGVAR' => 1, - 'AM_PROG_CC_C_O' => 1, + 'AC_LIBTOOL_SYS_HARD_LINK_LOCKS' => 1, + 'AC_ENABLE_STATIC' => 1, 'AX_FUNC_POSIX_MEMALIGN' => 1, + 'AM_PROG_CC_C_O' => 1, + '_LT_AC_TAGVAR' => 1, 'AC_LIBTOOL_LANG_F77_CONFIG' => 1, 'AM_CONDITIONAL' => 1, 'LT_LIB_DLLOAD' => 1, 'DX_FEATURE_dot' => 1, - 'LTDL_INIT' => 1, - '_LT_PROG_F77' => 1, - '_LT_PROG_CXX' => 1, 'LTVERSION_VERSION' => 1, - 'AM_PROG_INSTALL_SH' => 1, + '_LT_PROG_CXX' => 1, + '_LT_PROG_F77' => 1, + 'LTDL_INIT' => 1, 'm4_include' => 1, + 'AM_PROG_INSTALL_SH' => 1, 'AC_PROG_EGREP' => 1, - '_AC_AM_CONFIG_HEADER_HOOK' => 1, 'AC_PATH_MAGIC' => 1, + '_AC_AM_CONFIG_HEADER_HOOK' => 1, 'AC_LTDL_SYSSEARCHPATH' => 1, 'AM_MAKE_INCLUDE' => 1, 'LT_CMD_MAX_LEN' => 1, @@ -180,51 +180,51 @@ 'AC_PROG_LD_RELOAD_FLAG' => 1, 'DX_FEATURE_chi' => 1, 'AC_LTDL_DLSYM_USCORE' => 1, - 'LT_LANG' => 1, 'AM_MISSING_HAS_RUN' => 1, + 'LT_LANG' => 1, 'LT_SYS_DLSEARCH_PATH' => 1, 'LT_CONFIG_LTDL_DIR' => 1, - 'LT_OUTPUT' => 1, 'AC_LIBTOOL_DLOPEN_SELF' => 1, + 'LT_OUTPUT' => 1, 'AC_LIBTOOL_PROG_LD_SHLIBS' => 1, - 'AC_LIBTOOL_LINKER_OPTION' => 1, 'AC_WITH_LTDL' => 1, + 'AC_LIBTOOL_LINKER_OPTION' => 1, 'DX_CHI_FEATURE' => 1, - 'AC_LIBTOOL_CXX' => 1, 'LT_AC_PROG_RC' => 1, + 'AC_LIBTOOL_CXX' => 1, 'LT_INIT' => 1, - 'LT_SYS_DLOPEN_SELF' => 1, - 'LT_AC_PROG_GCJ' => 1, 'AX_CHECK_COMPILER_FLAGS' => 1, + 'LT_AC_PROG_GCJ' => 1, + 'LT_SYS_DLOPEN_SELF' => 1, 'DX_CURRENT_FEATURE' => 1, - '_LT_AC_PROG_CXXCPP' => 1, - 'AM_DISABLE_STATIC' => 1, 'AM_DEP_TRACK' => 1, - '_AC_PROG_LIBTOOL' => 1, + 'AM_DISABLE_STATIC' => 1, + '_LT_AC_PROG_CXXCPP' => 1, 'AM_CONFIG_HEADER' => 1, - '_AM_IF_OPTION' => 1, + '_AC_PROG_LIBTOOL' => 1, 'DX_HTML_FEATURE' => 1, + '_AM_IF_OPTION' => 1, 'AC_PATH_TOOL_PREFIX' => 1, - 'AC_LIBTOOL_F77' => 1, 'm4_pattern_allow' => 1, + 'AC_LIBTOOL_F77' => 1, 'AM_SET_LEADING_DOT' => 1, - 'LT_AC_PROG_EGREP' => 1, '_LT_PROG_FC' => 1, - '_AM_DEPENDENCIES' => 1, + 'LT_AC_PROG_EGREP' => 1, 'DX_DIRNAME_EXPR' => 1, + '_AM_DEPENDENCIES' => 1, 'AC_LIBTOOL_LANG_C_CONFIG' => 1, 'LTOPTIONS_VERSION' => 1, '_LT_AC_SYS_COMPILER' => 1, - 'AM_PROG_NM' => 1, 'DX_FEATURE_man' => 1, + 'AM_PROG_NM' => 1, 'DX_PS_FEATURE' => 1, 'AC_LIBLTDL_CONVENIENCE' => 1, 'AC_DEPLIBS_CHECK_METHOD' => 1, - 'AM_SET_CURRENT_AUTOMAKE_VERSION' => 1, 'AC_LIBLTDL_INSTALLABLE' => 1, + 'AM_SET_CURRENT_AUTOMAKE_VERSION' => 1, 'AC_LTDL_ENABLE_INSTALL' => 1, - 'AC_LIBTOOL_SYS_DYNAMIC_LINKER' => 1, 'LT_PROG_GCJ' => 1, + 'AC_LIBTOOL_SYS_DYNAMIC_LINKER' => 1, 'DX_FEATURE_chm' => 1, 'AM_INIT_AUTOMAKE' => 1, 'DX_FEATURE_rtf' => 1, @@ -235,32 +235,32 @@ '_LT_AC_LOCK' => 1, '_LT_AC_LANG_RC_CONFIG' => 1, 'LT_PROG_GO' => 1, - 'LT_SYS_MODULE_PATH' => 1, 'DX_ENV_APPEND' => 1, - 'AC_LIBTOOL_POSTDEP_PREDEP' => 1, + 'LT_SYS_MODULE_PATH' => 1, 'LT_WITH_LTDL' => 1, - 'AC_LTDL_SHLIBPATH' => 1, - 'AX_GCC_ARCHFLAG' => 1, + 'AC_LIBTOOL_POSTDEP_PREDEP' => 1, 'DX_ARG_ABLE' => 1, + 'AX_GCC_ARCHFLAG' => 1, + 'AC_LTDL_SHLIBPATH' => 1, 'AX_OPENMP' => 1, 'AM_AUX_DIR_EXPAND' => 1, - '_LT_AC_LANG_F77_CONFIG' => 1, 'AC_LIBTOOL_PROG_COMPILER_NO_RTTI' => 1, - '_AM_SET_OPTIONS' => 1, + '_LT_AC_LANG_F77_CONFIG' => 1, '_LT_COMPILER_OPTION' => 1, - '_AM_OUTPUT_DEPENDENCY_COMMANDS' => 1, + '_AM_SET_OPTIONS' => 1, 'AM_RUN_LOG' => 1, - 'AC_LIBTOOL_SYS_OLD_ARCHIVE' => 1, - 'AC_LTDL_SYS_DLOPEN_DEPLIBS' => 1, + '_AM_OUTPUT_DEPENDENCY_COMMANDS' => 1, 'AC_LIBTOOL_PICMODE' => 1, - 'LT_PATH_LD' => 1, - 'AC_CHECK_LIBM' => 1, + 'AC_LTDL_SYS_DLOPEN_DEPLIBS' => 1, + 'AC_LIBTOOL_SYS_OLD_ARCHIVE' => 1, 'ACX_PTHREAD' => 1, + 'AC_CHECK_LIBM' => 1, + 'LT_PATH_LD' => 1, 'AC_LIBTOOL_SYS_LIB_STRIP' => 1, '_AM_MANGLE_OPTION' => 1, - 'AC_LTDL_SYMBOL_USCORE' => 1, - 'AC_LIBTOOL_SYS_MAX_CMD_LEN' => 1, 'DX_FEATURE_xml' => 1, + 'AC_LIBTOOL_SYS_MAX_CMD_LEN' => 1, + 'AC_LTDL_SYMBOL_USCORE' => 1, 'AM_SET_DEPDIR' => 1, '_LT_CC_BASENAME' => 1, 'DX_FEATURE_ps' => 1, @@ -280,57 +280,57 @@ 'configure.in' ], { - '_LT_AC_TAGCONFIG' => 1, 'AM_PROG_F77_C_O' => 1, - 'AC_INIT' => 1, + '_LT_AC_TAGCONFIG' => 1, 'm4_pattern_forbid' => 1, - '_AM_COND_IF' => 1, + 'AC_INIT' => 1, 'AC_CANONICAL_TARGET' => 1, - 'AC_SUBST' => 1, + '_AM_COND_IF' => 1, 'AC_CONFIG_LIBOBJ_DIR' => 1, - 'AC_FC_SRCEXT' => 1, + 'AC_SUBST' => 1, 'AC_CANONICAL_HOST' => 1, + 'AC_FC_SRCEXT' => 1, 'AC_PROG_LIBTOOL' => 1, 'AM_INIT_AUTOMAKE' => 1, - 'AM_PATH_GUILE' => 1, 'AC_CONFIG_SUBDIRS' => 1, + 'AM_PATH_GUILE' => 1, 'AM_AUTOMAKE_VERSION' => 1, 'LT_CONFIG_LTDL_DIR' => 1, - 'AC_REQUIRE_AUX_FILE' => 1, 'AC_CONFIG_LINKS' => 1, - 'm4_sinclude' => 1, + 'AC_REQUIRE_AUX_FILE' => 1, 'LT_SUPPORTED_TAG' => 1, + 'm4_sinclude' => 1, 'AM_MAINTAINER_MODE' => 1, 'AM_NLS' => 1, 'AC_FC_PP_DEFINE' => 1, 'AM_GNU_GETTEXT_INTL_SUBDIR' => 1, - 'AM_MAKEFILE_INCLUDE' => 1, '_m4_warn' => 1, + 'AM_MAKEFILE_INCLUDE' => 1, 'AM_PROG_CXX_C_O' => 1, - '_AM_COND_ENDIF' => 1, '_AM_MAKEFILE_INCLUDE' => 1, + '_AM_COND_ENDIF' => 1, 'AM_ENABLE_MULTILIB' => 1, 'AM_SILENT_RULES' => 1, 'AM_PROG_MOC' => 1, 'AC_CONFIG_FILES' => 1, - 'include' => 1, 'LT_INIT' => 1, - 'AM_PROG_AR' => 1, + 'include' => 1, 'AM_GNU_GETTEXT' => 1, + 'AM_PROG_AR' => 1, 'AC_LIBSOURCE' => 1, - 'AM_PROG_FC_C_O' => 1, 'AC_CANONICAL_BUILD' => 1, + 'AM_PROG_FC_C_O' => 1, 'AC_FC_FREEFORM' => 1, - 'AH_OUTPUT' => 1, 'AC_FC_PP_SRCEXT' => 1, - '_AM_SUBST_NOTMAKE' => 1, + 'AH_OUTPUT' => 1, 'AC_CONFIG_AUX_DIR' => 1, - 'sinclude' => 1, - 'AM_PROG_CC_C_O' => 1, + '_AM_SUBST_NOTMAKE' => 1, 'm4_pattern_allow' => 1, - 'AM_XGETTEXT_OPTION' => 1, - 'AC_CANONICAL_SYSTEM' => 1, + 'AM_PROG_CC_C_O' => 1, + 'sinclude' => 1, 'AM_CONDITIONAL' => 1, + 'AC_CANONICAL_SYSTEM' => 1, + 'AM_XGETTEXT_OPTION' => 1, 'AC_CONFIG_HEADERS' => 1, 'AC_DEFINE_TRACE_LITERAL' => 1, 'AM_POT_TOOLS' => 1, @@ -351,57 +351,57 @@ 'configure.in' ], { - 'AM_PROG_F77_C_O' => 1, '_LT_AC_TAGCONFIG' => 1, - 'm4_pattern_forbid' => 1, + 'AM_PROG_F77_C_O' => 1, 'AC_INIT' => 1, - '_AM_COND_IF' => 1, + 'm4_pattern_forbid' => 1, 'AC_CANONICAL_TARGET' => 1, - 'AC_SUBST' => 1, + '_AM_COND_IF' => 1, 'AC_CONFIG_LIBOBJ_DIR' => 1, - 'AC_FC_SRCEXT' => 1, + 'AC_SUBST' => 1, 'AC_CANONICAL_HOST' => 1, + 'AC_FC_SRCEXT' => 1, 'AC_PROG_LIBTOOL' => 1, 'AM_INIT_AUTOMAKE' => 1, - 'AM_PATH_GUILE' => 1, 'AC_CONFIG_SUBDIRS' => 1, + 'AM_PATH_GUILE' => 1, 'AM_AUTOMAKE_VERSION' => 1, 'LT_CONFIG_LTDL_DIR' => 1, - 'AC_CONFIG_LINKS' => 1, 'AC_REQUIRE_AUX_FILE' => 1, - 'm4_sinclude' => 1, + 'AC_CONFIG_LINKS' => 1, 'LT_SUPPORTED_TAG' => 1, + 'm4_sinclude' => 1, 'AM_MAINTAINER_MODE' => 1, 'AM_NLS' => 1, 'AC_FC_PP_DEFINE' => 1, 'AM_GNU_GETTEXT_INTL_SUBDIR' => 1, - 'AM_MAKEFILE_INCLUDE' => 1, '_m4_warn' => 1, + 'AM_MAKEFILE_INCLUDE' => 1, 'AM_PROG_CXX_C_O' => 1, - '_AM_MAKEFILE_INCLUDE' => 1, '_AM_COND_ENDIF' => 1, + '_AM_MAKEFILE_INCLUDE' => 1, 'AM_ENABLE_MULTILIB' => 1, 'AM_PROG_MOC' => 1, 'AM_SILENT_RULES' => 1, 'AC_CONFIG_FILES' => 1, - 'LT_INIT' => 1, 'include' => 1, - 'AM_GNU_GETTEXT' => 1, + 'LT_INIT' => 1, 'AM_PROG_AR' => 1, + 'AM_GNU_GETTEXT' => 1, 'AC_LIBSOURCE' => 1, - 'AM_PROG_FC_C_O' => 1, 'AC_CANONICAL_BUILD' => 1, + 'AM_PROG_FC_C_O' => 1, 'AC_FC_FREEFORM' => 1, - 'AH_OUTPUT' => 1, 'AC_FC_PP_SRCEXT' => 1, - 'AC_CONFIG_AUX_DIR' => 1, + 'AH_OUTPUT' => 1, '_AM_SUBST_NOTMAKE' => 1, - 'm4_pattern_allow' => 1, - 'sinclude' => 1, + 'AC_CONFIG_AUX_DIR' => 1, 'AM_PROG_CC_C_O' => 1, - 'AM_XGETTEXT_OPTION' => 1, - 'AC_CANONICAL_SYSTEM' => 1, + 'sinclude' => 1, + 'm4_pattern_allow' => 1, 'AM_CONDITIONAL' => 1, + 'AC_CANONICAL_SYSTEM' => 1, + 'AM_XGETTEXT_OPTION' => 1, 'AC_CONFIG_HEADERS' => 1, 'AC_DEFINE_TRACE_LITERAL' => 1, 'AM_POT_TOOLS' => 1, diff --git a/examples/test_qr.c b/examples/test_qr.c index 97860917eb98eb1928a1ab37ce0568bc3f76ec38..d5d2fb8db9991dba697b0d0620976955b3d3f1ac 100644 --- a/examples/test_qr.c +++ b/examples/test_qr.c @@ -34,6 +34,17 @@ /* Local includes. */ #include "quicksched.h" +void printMatrix(double* Matrix, int m, int n, int tilesize) { + int i, j; + + for (i = 0; i < m * tilesize; i++) { + for (j = 0; j < n * tilesize; j++) { + printf(" %.3f ", Matrix[j * m * tilesize + i]); + } + printf("\n"); + } +} + /** * Takes a column major matrix, NOT tile major. size is length of a side of the * matrix. Only works for square matrices. @@ -85,14 +96,13 @@ double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) { 1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n]; } } - /* Qtemp = Qtemp * (I-tau*w*w')' */ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Qtemp, size, ww, size, 0.0, temp, size); double* b = Qtemp; Qtemp = temp; temp = b; - } + } cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Q, size, Qtemp, size, 0.0, temp, size); double* b = Q; @@ -120,16 +130,7 @@ double* getR(double* HR, int size) { return R; } -void printMatrix(double* Matrix, int m, int n, int tilesize) { - int i, j; - for (i = 0; i < m * tilesize; i++) { - for (j = 0; j < n * tilesize; j++) { - printf(" %.3f ", Matrix[j * m * tilesize + i]); - } - printf("\n"); - } -} double* columnToTile(double* columnMatrix, int size, int m, int n, int tilesize) { @@ -395,6 +396,20 @@ void DTSQRF(double* restrict cornerTile, double* restrict columnTile, } } +/* Generates a random matrix. */ +double* generateColumnMatrix(int size, unsigned long int m_z) { + double* matrix = malloc(sizeof(double) * size); + if (matrix == NULL) error("Failed to allocate matrix"); + + int i; + for (i = 0; i < size; i++) { + m_z = (1664525 * m_z + 1013904223) % 4294967296; + matrix[i] = m_z % 100; + if (matrix[i] < 0) matrix[i] += 100; + } + return matrix; +} + /** * * @brief Applies the householder factorisation of the corner to the row tile. @@ -511,6 +526,8 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) { else A_orig[k] = matrix[k]; } + if(matrix == NULL) + A_orig = generateColumnMatrix(m*n*K*K, 35532); memcpy(A, A_orig, sizeof(double) * m * n * K * K); bzero(tau, sizeof(double) * m * n * K); @@ -588,7 +605,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) { qsched_addlock(&s, tid_new, rid[j * m + i]); qsched_adduse(&s, tid_new, rid[k * m + i]); qsched_adduse(&s, tid_new, rid[j * m + k]); - // qsched_addunlock(&s, tid[k * m + i], tid_new); + qsched_addunlock(&s, tid[k * m + i], tid_new); qsched_addunlock(&s, tid[j * m + i - 1], tid_new); if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new); @@ -600,11 +617,11 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) { tot_setup = getticks() - tic; /* Dump the number of tasks. */ - message("total nr of tasks: %i.", s.count); - message("total nr of deps: %i.", s.count_deps); - message("total nr of res: %i.", s.count_res); - message("total nr of locks: %i.", s.count_locks); - message("total nr of uses: %i.", s.count_uses); + //message("total nr of tasks: %i.", s.count); + //message("total nr of deps: %i.", s.count_deps); + //message("total nr of res: %i.", s.count_res); + //message("total nr of locks: %i.", s.count_locks); + //message("total nr of uses: %i.", s.count_uses); /* Loop over the number of runs. */ for (k = 0; k < runs; k++) { @@ -613,7 +630,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) { tic = getticks(); qsched_run(&s, nr_threads, runner); toc_run = getticks(); - message("%ith run took %lli ticks...", k, toc_run - tic); + // message("%ith run took %lli ticks...", k, toc_run - tic); tot_run += toc_run - tic; } @@ -643,20 +660,21 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) { } */ /* Dump the costs. */ - message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup, - tot_run / runs); + double itpms = 1000.0 / CPU_TPS; + message("costs: setup=%.3f ms, run=%.3f ms.", ((double)tot_setup)*itpms, + ((double)tot_run)*itpms); /* Dump the timers. */ - for (k = 0; k < qsched_timer_count; k++) + /*for (k = 0; k < qsched_timer_count; k++) message("timer %s is %lli ticks.", qsched_timer_names[k], - s.timers[k] / runs); + s.timers[k] / runs);*/ if (matrix != NULL) { for (k = 0; k < m * n * K * K; k++) matrix[k] = A[k]; } /* Test if the decomposition was correct.*/ - /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K); +/* double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K); double *Q = computeQ(tempMatrix, m*K, K, tau, m); double *R = getR(tempMatrix, m*K); cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, @@ -683,19 +701,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) { qsched_free(&s); } -/* Generates a random matrix. */ -double* generateColumnMatrix(int size, unsigned long int m_z) { - double* matrix = malloc(sizeof(double) * size * size); - if (matrix == NULL) error("Failed to allocate matrix"); - int i; - for (i = 0; i < size * size; i++) { - m_z = (1664525 * m_z + 1013904223) % 4294967296; - matrix[i] = m_z % 100; - if (matrix[i] < 0) matrix[i] += 100; - } - return matrix; -} /* Unit tests for the tiled QR decomposition.*/ @@ -703,10 +709,10 @@ void test_SSSRFT() { double* cornerTile, *rowTile, *columnTile, *otherTile; double tau[1024]; - cornerTile = generateColumnMatrix(32, 35136); - rowTile = generateColumnMatrix(32, 35239); - columnTile = generateColumnMatrix(32, 35339); - otherTile = generateColumnMatrix(32, 35739); + cornerTile = generateColumnMatrix(32*32, 35136); + rowTile = generateColumnMatrix(32*32, 35239); + columnTile = generateColumnMatrix(32*32, 35339); + otherTile = generateColumnMatrix(32*32, 35739); DGEQRF(cornerTile, 32, tau, 0, 2); DTSQRF(cornerTile, columnTile, 32, 1, 0, tau, @@ -733,8 +739,8 @@ void test_STSQRF() { double *cornerTile, *columnTile; double tau[1024]; - cornerTile = generateColumnMatrix(32, 35531); - columnTile = generateColumnMatrix(32, 35585); + cornerTile = generateColumnMatrix(32*32, 35531); + columnTile = generateColumnMatrix(32*32, 35585); DGEQRF(cornerTile, 32, tau, 0, 2); DTSQRF(cornerTile, columnTile, 32, 1, 0, tau, @@ -749,8 +755,8 @@ void test_SLARFT() { double* cornerTile, *rowTile; double tau[1024]; - cornerTile = generateColumnMatrix(32, 35536); - rowTile = generateColumnMatrix(32, 35539); + cornerTile = generateColumnMatrix(32*32, 35536); + rowTile = generateColumnMatrix(32*32, 35539); DGEQRF(cornerTile, 32,tau, 0, 2); DLARFT(cornerTile,rowTile, 32, 1, 0, tau, 2); @@ -766,7 +772,7 @@ void test_SGEQRF() { double* cornerTile; double tau[1024]; - cornerTile = generateColumnMatrix(32, 35532); + cornerTile = generateColumnMatrix(32*32, 35532); DGEQRF( cornerTile, 32, tau, 0, 2); printMatrix(cornerTile, 1, 1, 32); @@ -774,10 +780,10 @@ void test_SGEQRF() } void runTests() { - // test_SGEQRF(); + test_SGEQRF(); // test_SLARFT(); // test_STSQRF(); - test_SSSRFT(); +// test_SSSRFT(); } diff --git a/examples/test_qr.cu b/examples/test_qr.cu index 95b6ad51ea76ebe51707e5ad37bd33046bf6338b..1e8be2456b665d6f53395e0a07ed1604c2d60687 100644 --- a/examples/test_qr.cu +++ b/examples/test_qr.cu @@ -7,13 +7,18 @@ #include <string.h> #include <unistd.h> #include <math.h> +#include <gperftools/profiler.h> + + /* Local includes. */ extern "C"{ #include "quicksched.h" +#include <cblas.h> } #include "cuda_queue.h" +int g_size; enum task_types { task_SGEQRF , task_SLARFT , task_STSQRF , task_SSSRFT} ; //#define TID threadIdx.x @@ -28,6 +33,7 @@ __device__ float *GPU_tau; __device__ int cuda_m; __device__ int cuda_n; + void printMatrix(float *matrix, int m, int n, int tilesize) { int i, j; @@ -44,16 +50,110 @@ void printMatrix(float *matrix, int m, int n, int tilesize) } +/** + * Takes a column major matrix, NOT tile major. size is length of a side of the + * matrix. Only works for square matrices. + * This function is simply for validation and is implemented naively as we know + * of no implementation to retrieve Q from the tiled QR. + */ +float* computeQ(float* HR, int size, int tilesize, float* tau, int tauNum) { + float* Q = (float*)malloc(sizeof(float) * size * size); + float* Qtemp = (float*)malloc(sizeof(float) * size * size); + float* w = (float*)malloc(sizeof(float) * size); + float* ww = (float*)malloc(sizeof(float) * size * size); + float* temp = (float*)malloc(sizeof(float) * size * size); + int i, k, l, j, n; + bzero(Q, sizeof(float) * size * size); + bzero(Qtemp, sizeof(float) * size * size); + bzero(ww, sizeof(float) * size * size); + + for (i = 0; i < size; i++) { + Q[i * size + i] = 1.0; + } + int numcoltile = size / tilesize; + int numrowtile = size / tilesize; + for (k = 0; k < numrowtile; k++) { + for (l = 0; l < tilesize; l++) { + bzero(Qtemp, sizeof(float) * size * size); + for (i = 0; i < size; i++) { + Qtemp[i * size + i] = 1.0; + } + + for (i = k; i < numcoltile; i++) { + bzero(w, sizeof(float) * size); + + for (j = 0; j < tilesize; j++) { + w[i * tilesize + j] = + HR[(k * tilesize + l) * size + i * tilesize + j]; + } + w[k * tilesize + l] = 1.0; + if (k * tilesize + l > i * tilesize) { + for (j = 0; j < k * tilesize + l; j++) w[j] = 0.0; + } + + /* Compute (I - tau*w*w')' */ + for (j = 0; j < size; j++) { + for (n = 0; n < size; n++) { + if (j != n) + ww[n * size + j] = + -tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n]; + else + ww[n * size + j] = + 1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n]; + } + } + /* Qtemp = Qtemp * (I-tau*w*w')' */ + cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, + 1.0, Qtemp, size, ww, size, 0.0, temp, size); + float* b = Qtemp; + Qtemp = temp; + temp = b; + + } + cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, + 1.0, Q, size, Qtemp, size, 0.0, temp, size); + float* b = Q; + Q = temp; + temp = b; + } + } + + free(Qtemp); + free(w); + free(ww); + free(temp); + return Q; +} + +float* getR(float* HR, int size) { + float* R = (float*)malloc(sizeof(float) * size * size); + int i, j; + bzero(R, sizeof(float) * size * size); + for (i = 0; i < size; i++) { + for (j = 0; j <= i; j++) { + R[i * size + j] = HR[i * size + j]; + } + } + return R; +} + + + /* Generates a random matrix. */ float* generateColumnMatrix(int size, unsigned long int m_z) { float* matrix; - cudaMallocHost(&matrix, sizeof(float)*size*size); + + cudaError_t code = cudaMallocHost(&matrix, sizeof(float)*size/**size*/); + if(code != cudaSuccess) + printf("%s size = %i g_size = %i\n", cudaGetErrorString(code),size, g_size); + else + g_size = g_size + sizeof(float)*size*size; if(matrix == NULL) error("Failed to allocate matrix"); int i; - for(i = 0 ; i < size*size; i++) + for(i = 0 ; i < size; i++) { m_z = (1664525*m_z + 1013904223) % 4294967296; matrix[i] = m_z % 100; @@ -454,43 +554,39 @@ __device__ void runner ( int type , void *data ) { workVector = blockCache + (32*32); /* Decode the task data. */ int *idata = (int *)data; - int i = idata[0], j = idata[1], k = idata[2]; + int i = idata[0]; + int j = idata[1]; + int k = idata[2]; // int z; // double buff[ 2*K*K ]; /* Decode and execute the task. */ switch ( type ) { case task_SGEQRF: + //if(threadIdx.x == 0) + // printf("SGEQRF %i %i\n", k, (k*cuda_m+k)*32*32); if(threadIdx.x < 32) -// runner_cuda_SGEQRF(i, j, k, workVector, blockCache); - - if(threadIdx.x == 0){ - printf("SGEQRF: %i %i %i + %i\n",i, j , k, clock()); - printf("tau = "); - for(k = 0; k < cuda_m * cuda_n * 32 ; k++) - { - printf("%.3f ", GPU_tau[k]); - } - printf("\n");} + SGEQRF( &GPU_matrix[(k*cuda_m+k)*32*32], 32, GPU_tau, k, cuda_m); break; case task_SLARFT: - if(threadIdx.x == 0) - printf("SLARFT: %i %i %i + %i\n",i,j,k, clock()); - //runner_cuda_SLARFT(i, j, k, workVector, blockCache); + // if(threadIdx.x == 0) + // printf("SLARFT %i %i %i\n", k,j, (j*cuda_m+k)*32*32); + SLARFT( &GPU_matrix[(k*cuda_m +k)*32*32], &GPU_matrix[(j*cuda_m+k)*32*32], 32, j, k, GPU_tau, cuda_m); break; case task_STSQRF: - if(threadIdx.x == 0) - printf("STSQRF: %i %i %i + %i\n",i,j,k, clock()); - //runner_cuda_STSQRF(i, j, k, workVector, blockCache); + // if(threadIdx.x == 0) + // printf("STSQRF %i %i %i\n", k,i, (k*cuda_m+i)*32*32); + if(threadIdx.x < 32) + STSQRF( &GPU_matrix[(k*cuda_m+k)*32*32], &GPU_matrix[(k*cuda_m + i)*32*32], 32, i, k, GPU_tau, cuda_m); break; case task_SSSRFT: - if(threadIdx.x == 0) - printf("SSSRFT: %i %i %i + %i\n",i,j,k, clock()); - //runner_cuda_SSSRFT(i, j, k, workVector, blockCache); + // if(threadIdx.x == 0) + // printf("SSSRFT %i %i %i %i\n", k,j, i, (j*cuda_m+i)*32*32); + SSSRFT( &GPU_matrix[(j*cuda_m+i)*32*32],&GPU_matrix[(k*cuda_m + i)*32*32],&GPU_matrix[(j*cuda_m+k)*32*32], 32, i, j, k, GPU_tau, cuda_m ); break; default: asm("trap;"); } - +__syncthreads(); } __device__ qsched_funtype function = runner; @@ -505,6 +601,7 @@ float* generateMatrix( int m, int n) { float* Matrix; cudaMallocHost(&Matrix, sizeof(float) * m*n*32*32); + g_size = g_size + sizeof(float)*m*n*32*32; if(Matrix == NULL) error("Failed to allocate Matrix"); int i, j; @@ -557,40 +654,33 @@ float* columnToTile( float* columnMatrix, int size , int m , int n ) } -float* tileToColumn( float* tileMatrix, int size, int m , int n ) -{ - float* ColumnMatrix; - ColumnMatrix = (float*) malloc(sizeof(float) * size ); - if(ColumnMatrix == NULL) - error("failed to allocate ColumnMatrix"); - int rows = m*32; - int columns = n*32; - int i,j,k,l; - for( i = 0; i < n ; i++ ) - { - for(j = 0; j < m ; j++ ) - { - /* Tile on ith column is at i*m*32*32.*/ - /* Tile on jth is at j*32*32 */ - float *tile = &tileMatrix[i*m*32*32 + j*32*32]; - /* Column starts at same position as tile. */ - /* Row j*32.*/ - float *tilePos = &ColumnMatrix[i*m*32*32 + j*32]; - for( k = 0; k < 32; k++ ) - { - for(l=0; l < 32; l++) - { - tilePos[l] = tile[l]; - } - /* Next 32 elements are the position of the tile in the next column.*/ - tile = &tile[32]; - /* Move to the j*32th position in the next column. */ - tilePos = &tilePos[32*m]; - - } - } +float* tileToColumn(float* tileMatrix, int size, int m, int n, int tilesize) { + float* ColumnMatrix; + ColumnMatrix = (float*)malloc(sizeof(float) * size); + if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix"); + int i, j, k, l; + for (i = 0; i < n; i++) { + for (j = 0; j < m; j++) { + /* Tile on ith column is at i*m*32*32.*/ + /* Tile on jth is at j*32*32 */ + float* tile = + &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize]; + /* Column starts at same position as tile. */ + /* Row j*32.*/ + float* tilePos = + &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize]; + for (k = 0; k < tilesize; k++) { + for (l = 0; l < tilesize; l++) { + tilePos[l] = tile[l]; + } + /* Next 32 elements are the position of the tile in the next column.*/ + tile = &tile[tilesize]; + /* Move to the j*32th position in the next column. */ + tilePos = &tilePos[tilesize * m]; + } } - return ColumnMatrix; + } + return ColumnMatrix; } @@ -672,24 +762,29 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) /* Initialize the scheduler. */ qsched_init( &s , 1 , qsched_flag_none ); cudaDeviceReset(); + if( cudaPeekAtLastError() != cudaSuccess) + error("Setup Failed: %s", cudaGetErrorString(cudaPeekAtLastError())); cudaSetDevice(0); Setup<<<1,1>>>(); - if(cudaDeviceSynchronize() != cudaSuccess) + g_size = 0; + if(cudaDeviceSynchronize() != cudaSuccess) error("Setup Failed: %s", cudaGetErrorString(cudaPeekAtLastError())); /* Allocate and fill the original matrix. */ if(cudaMallocHost(&A, sizeof(float) * m * n * K * K ) != cudaSuccess || - cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess || - cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K ) != cudaSuccess ) + cudaMallocHost(&tau, sizeof(float) * m * n * K ) != cudaSuccess /*|| + cudaMallocHost(&A_orig, sizeof(float) * m * n * K * K ) != cudaSuccess*/ ) error("Failed to allocate matrices."); - cudaFreeHost(A_orig); + g_size = g_size + sizeof(float) * m * n * K * K + sizeof(float) * m * n * K; + //cudaFreeHost(A_orig); // for ( k = 0 ; k < m * n * K * K ; k++ ) // A_orig[k] = 2.0f*((float)rand()) / RAND_MAX - 1.0f; - A_orig = generateMatrix(m, n); + A_orig = generateColumnMatrix(m*n*K*K, 35532); + // printMatrix(A_orig, m, n); float *temp = columnToTile(A_orig, m * n * K *K, m , n); cudaFreeHost(A_orig); A_orig = temp; - temp = tileToColumn(A_orig, m*n*K*K, m , n); + temp = tileToColumn(A_orig, m*n*K*K, m , n, K); // printMatrix(temp, m, n); // printTileMatrix(A_orig, m, n); @@ -721,6 +816,7 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) tid[k] = qsched_task_none; if( cudaHostGetDevicePointer(&temp_device_array , &A[k*32*32] , 0) != cudaSuccess) error("Failed to get device pointer to matrix"); + rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none , &A[k*32*32], sizeof(float) * 32 * 32, device_array + k * 32 *32); } @@ -730,22 +826,20 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) /* Add kth corner task. */ data[0] = k; data[1] = k; data[2] = k; tid_new = qsched_addtask( &s , task_SGEQRF , task_flag_none , data , sizeof(int)*3 , 2 ); - qsched_addlock( &s , tid_new , rid[ k*m + k ] ); - if ( tid[ k*m + k ] != -1 ) - qsched_addunlock( &s , tid[ k*m + k ] , tid_new ); - tid[ k*m + k ] = tid_new; + qsched_addlock(&s, tid_new, rid[k * m + k]); + if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new); + tid[k * m + k] = tid_new; /* Add column tasks on kth row. */ for ( j = k+1 ; j < n ; j++ ) { data[0] = k; data[1] = j; data[2] = k; tid_new = qsched_addtask( &s , task_SLARFT , task_flag_none , data , sizeof(int)*3 , 3 ); - qsched_addlock( &s , tid_new , rid[ j*m + k ] ); - qsched_adduse( &s , tid_new , rid[ k*m + k ] ); - qsched_addunlock( &s , tid[ k*m + k ] , tid_new ); - if ( tid[ j*m + k ] != -1 ) - qsched_addunlock( &s , tid[ j*m + k ] , tid_new ); - tid[ j*m + k ] = tid_new; + qsched_addlock(&s, tid_new, rid[j * m + k]); + qsched_adduse(&s, tid_new, rid[k * m + k]); + qsched_addunlock(&s, tid[k * m + k], tid_new); + if (tid[j * m + k] != -1) qsched_addunlock(&s, tid[j * m + k], tid_new); + tid[j * m + k] = tid_new; } /* For each following row... */ @@ -754,25 +848,23 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) /* Add the row tasks for the kth column. */ data[0] = i; data[1] = k; data[2] = k; tid_new = qsched_addtask( &s , task_STSQRF , task_flag_none , data , sizeof(int)*3 , 3 ); - qsched_addlock( &s , tid_new , rid[ k*m + i ] ); - qsched_adduse( &s , tid_new , rid[ k*m + k ] ); - qsched_addunlock( &s , tid[ k*m + (i-1) ] , tid_new ); - if ( tid[ k*m + i ] != -1 ) - qsched_addunlock( &s , tid[ k*m + i ] , tid_new ); - tid[ k*m + i ] = tid_new; - + qsched_addlock(&s, tid_new, rid[k * m + i]); + qsched_adduse(&s, tid_new, rid[k * m + k]); + qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new); + if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new); + tid[k * m + i] = tid_new; + /* Add the inner tasks. */ for ( j = k+1 ; j < n ; j++ ) { data[0] = i; data[1] = j; data[2] = k; tid_new = qsched_addtask( &s , task_SSSRFT , task_flag_none , data , sizeof(int)*3 , 5 ); - qsched_addlock( &s , tid_new , rid[ j*m + i ] ); - qsched_adduse( &s , tid_new , rid[ k*m + i ] ); - qsched_adduse( &s , tid_new , rid[ j*m + k ] ); - qsched_addunlock( &s , tid[ k*m + i ] , tid_new ); - qsched_addunlock( &s , tid[ j*m + k ] , tid_new ); - if ( tid[ j*m + i ] != -1 ) - qsched_addunlock( &s , tid[ j*m + i ] , tid_new ); - tid[ j*m + i ] = tid_new; + qsched_addlock(&s, tid_new, rid[j * m + i]); + qsched_adduse(&s, tid_new, rid[k * m + i]); + qsched_adduse(&s, tid_new, rid[j * m + k]); + qsched_addunlock(&s, tid[k * m + i], tid_new); + qsched_addunlock(&s, tid[j * m + i - 1], tid_new); + if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new); + tid[j * m + i] = tid_new; } } @@ -787,11 +879,42 @@ void test_qr(int m , int n , int K , int nr_threads , int runs) if( cudaMemcpyToSymbol( cuda_n, &n, sizeof(int), 0, cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy n to the device"); + qsched_run_CUDA( &s , func ); - cudaMemcpy( A , device_array , sizeof(float) * m * n * K * K, cudaMemcpyHostToDevice); - A = tileToColumn(A,m * n * K * K, m, n); - printMatrix(A, m, n); + + if(cudaMemcpy( tau , tau_device , sizeof(float) * m * n * K , cudaMemcpyDeviceToHost ) != cudaSuccess ) + error("Failed to copy the tau data from the device."); + + // printMatrix(tileToColumn(A, m*n*K*K, m, n, K), m, n); + /*float *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K); + float *Q = computeQ(tempMatrix, m*K, K, tau, m); + float *R = getR(tempMatrix, m*K); + cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, + m*K, R, m*K, 0.0, tempMatrix, m*K); + free(Q); + // printMatrix(tempMatrix, m, n); + // printf("\n\n\n\n"); + Q = tileToColumn(A_orig, m*n*K*K, m, n, K); + for(i = 0; i < m * n * K * K; i++) + { + if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] < + 0.995)) + printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i], + tempMatrix[i]); + } + free(tempMatrix); + free(Q); + free(R);*/ + +// cudaMemcpy( A , device_array , sizeof(float) * m * n * K * K, cudaMemcpyHostToDevice); + // A = tileToColumn(A,m * n * K * K, m, n, K); + // printMatrix(A, m, n); // printTileMatrix(A, m , n); + struct task* tasks = qsched_get_timers( &s , s.count ); + /*for(i = 0; i < s.count; i++) + { + printf("%i %lli %lli %i\n", tasks[i].type, tasks[i].tic, tasks[i].toc , tasks[i].blockID); + }*/ } @@ -812,7 +935,7 @@ void test() printf("\n \n \n"); //printTileMatrix(MatrixTile, m, n); free(Matrix); - Matrix = tileToColumn(MatrixTile, m*n*32*32, m , n); + Matrix = tileToColumn(MatrixTile, m*n*32*32, m , n, 32); printMatrix(Matrix, m, n); free(MatrixTile); free(Matrix); @@ -832,11 +955,11 @@ __global__ void SGEQRF_test(float* cornerTile) for(i = threadIdx.x; i < 32*32; i+=blockDim.x) { - printf("Copying value %i\n", i); + // printf("Copying value %i\n", i); tile[i] = cornerTile[i]; } - printf("blockDim.x = %i\n", blockDim.x); + // printf("blockDim.x = %i\n", blockDim.x); __syncthreads(); if(threadIdx.x < 32) SGEQRF(tile, 32, tau, 0, 1); @@ -849,7 +972,7 @@ __global__ void SGEQRF_test(float* cornerTile) void test_SGEQRF() { float* cornerTile; - cornerTile = generateColumnMatrix(32, 35532); + cornerTile = generateColumnMatrix(32*32, 35532); SGEQRF_test<<<1, 128>>>(cornerTile); cudaDeviceSynchronize(); printMatrix(cornerTile, 1, 1, 32); @@ -886,8 +1009,8 @@ __global__ void SLARFT_test(float* cornerTile, float* rowTile) void test_SLARFT() { float* cornerTile, *rowTile; - cornerTile = generateColumnMatrix(32, 35536); - rowTile = generateColumnMatrix(32, 35539); + cornerTile = generateColumnMatrix(32*32, 35536); + rowTile = generateColumnMatrix(32*32, 35539); SLARFT_test<<<1, 128>>>(cornerTile, rowTile); cudaDeviceSynchronize(); printMatrix(cornerTile, 1, 1, 32); @@ -923,8 +1046,8 @@ __global__ void STSQRF_test(float* cornerTile, float* columnTile) void test_STSQRF() { float *cornerTile, *columnTile; - cornerTile = generateColumnMatrix(32, 35531); - columnTile = generateColumnMatrix(32, 35585); + cornerTile = generateColumnMatrix(32*32, 35531); + columnTile = generateColumnMatrix(32*32, 35585); STSQRF_test<<<1,32>>>(cornerTile, columnTile); cudaDeviceSynchronize(); printMatrix(cornerTile, 1, 1, 32); @@ -971,10 +1094,10 @@ __global__ void SSSRFT_test(float* cornerTile, float* columnTile, float* rowTile void test_SSSRFT() { float* cornerTile, *rowTile, *columnTile, *otherTile; - cornerTile = generateColumnMatrix(32, 35136); - rowTile = generateColumnMatrix(32, 35239); - columnTile = generateColumnMatrix(32, 35339); - otherTile = generateColumnMatrix(32, 35739); + cornerTile = generateColumnMatrix(32*32, 35136); + rowTile = generateColumnMatrix(32*32, 35239); + columnTile = generateColumnMatrix(32*32, 35339); + otherTile = generateColumnMatrix(32*32, 35739); SSSRFT_test<<<1, 128>>>(cornerTile, columnTile, rowTile, otherTile); cudaDeviceSynchronize(); printMatrix(cornerTile, 1, 1, 32); @@ -995,10 +1118,10 @@ void test_SSSRFT() void runTests() { - // test_SGEQRF(); + test_SGEQRF(); // test_SLARFT(); // test_STSQRF(); - test_SSSRFT(); + // test_SSSRFT(); } /** * @brief Main function. @@ -1056,5 +1179,6 @@ int main ( int argc , char *argv[] ) { test_qr( M , N , K , nr_threads , runs ); + printf("\n"); } diff --git a/src/cuda_queue.cu b/src/cuda_queue.cu index 6e35f8390e5f8f9cfa1f858e980587393b7555a8..45add67c108e3d9a0c2ddcfdb07d76c4eb58a1b8 100644 --- a/src/cuda_queue.cu +++ b/src/cuda_queue.cu @@ -23,6 +23,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <gperftools/profiler.h> extern "C"{ #include "quicksched.h" @@ -52,6 +53,7 @@ __device__ qsched_task_t *deps_cuda; __device__ struct res *res_cuda; __device__ char *data_cuda; __device__ int cuda_barrier = 0; +__device__ volatile int tot_num_tasks; __device__ qsched_funtype fun; /** @@ -95,12 +97,11 @@ __device__ __inline__ void cuda_memcpy_tasks ( void *dest , void *source , int c /* Copy the data in chunks of sizeof(int). */ for ( k = threadIdx.x ; k < count/sizeof(int) ; k += blockDim.x ){ idest[k] = isource[k]; -// idest[k] *= tid; } } -__device__ int cuda_queue_gettask ( struct queue_cuda *q, int wait ) { +__device__ int cuda_queue_gettask ( struct queue_cuda *q ) { int ind, tid = -1; @@ -114,12 +115,14 @@ __device__ int cuda_queue_gettask ( struct queue_cuda *q, int wait ) { /* Wrap the index. */ ind %= cuda_queue_size; /* Loop until there is a valid task at that index. */ - while ( q->rec_count < q->count && ( tid = q->data[ind] ) < 0 && wait); + while ( q->rec_count < q->count && ( tid = q->data[ind] ) < 0); /* Scratch the task from the queue */ if ( tid >= 0 ) + { q->data[ind] = -1; - + atomicAdd((int*) &tot_num_tasks, -1); + } /* Return the acquired task ID. */ return tid; @@ -144,6 +147,8 @@ __device__ void cuda_queue_puttask ( struct queue_cuda *q , int tid ) { /* Write the task back to the queue. */ q->data[ind] = tid; + + atomicAdd((int*)&q->nr_avail_tasks, 1); } @@ -266,8 +271,6 @@ __device__ int cuda_locktask ( int tid ) { } -//TODO - /** * @brief Tell the #qsched that a task has completed. * @@ -279,27 +282,10 @@ __device__ void cuda_done ( struct task *t ) { int k; struct task *t2; - /* Release this task's locks. */ for ( k = 0 ; k < t->nr_locks ; k++ ) cuda_unlockres( t->locks[k] ); - - /* Loop over the task's unlocks... */ - //for ( k = 0 ; k < t->nr_unlocks ; k++ ) { - - /* Get a grip on the unlocked task. */ - // t2 = &tasks_cuda[ t->unlocks[k] ]; - - /* Is the unlocked task ready to run? */ - //if ( atomicAdd( &t2->wait, -1 ) == 1 && !( t2->flags & task_flag_skip ) ) - // cuda_queue_puttask( &cuda_queues[0] , t->unlocks[k] ); - - //} - - /* Set the task stats. */ - //t->toc = getticks(); - //t->cost = t->toc - t->tic; } @@ -314,16 +300,21 @@ __device__ void cuda_done ( struct task *t ) { * This routine blocks until a valid task is picked up, or the * specified queue is empty. */ -__device__ int runner_cuda_gettask ( struct queue_cuda *q, int wait ) { +__device__ int runner_cuda_gettask ( struct queue_cuda *q ) { int tid = -1; - + if( atomicAdd((int*)&q->nr_avail_tasks, -1) <= 0) + { + atomicAdd((int*)&q->nr_avail_tasks, 1); + return -1; + } /* Main loop. */ - while ( ( tid = cuda_queue_gettask( q , wait) ) >= 0 ) { + while ( ( tid = cuda_queue_gettask( q ) ) >= 0 ) { if( cuda_locktask(tid) == 1 ) break; + atomicAdd((int*)&q->nr_avail_tasks, -1); cuda_queue_puttask ( q , tid ); } @@ -353,13 +344,17 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q, int wait ) { * This routine blocks until a valid task is picked up, or the * specified queue is empty. */ -__device__ int runner_cuda_gettask ( struct queue_cuda *q, int wait ) { +__device__ int runner_cuda_gettask ( struct queue_cuda *q ) { int tid = -1; - + if( atomicAdd((int*)&q->nr_avail_tasks, -1) <= 0) + { + atomicAdd((int*)&q->nr_avail_tasks, 1); + return -1; + } /* Main loop. */ - while ( ( tid = cuda_queue_gettask( q , wait) ) >= 0 ) { + while ( ( tid = cuda_queue_gettask( q ) ) >= 0 ) { break; } @@ -380,6 +375,7 @@ __device__ int runner_cuda_gettask ( struct queue_cuda *q, int wait ) { __global__ void qsched_device_kernel ( ) { volatile __shared__ int tid; + volatile __shared__ int done; int *src, *dest; int i; @@ -389,27 +385,42 @@ __global__ void qsched_device_kernel ( ) __syncthreads(); if ( threadIdx.x == 0 ) { tid = -1; - if( cuda_queues[0].count == cuda_queues[0].rec_count ) - tid = runner_cuda_gettask ( &cuda_queues[1], 1); - else - tid = runner_cuda_gettask ( &cuda_queues[1], 0); + /* Highest priority queue, holds the unload tasks. */ + if(cuda_queues[2].nr_avail_tasks > 0 ) + tid = runner_cuda_gettask( &cuda_queues[2] ); + + /* Middle priority queue, contains user-specifed tasks. */ + if( tid < 0 && cuda_queues[0].nr_avail_tasks > 0 ) + tid = runner_cuda_gettask ( &cuda_queues[0]); - if( tid < 0 ) - tid = runner_cuda_gettask ( &cuda_queues[0], 1); + /* Low priority queue, contains the load tasks. */ + if( tid < 0 && cuda_queues[1].nr_avail_tasks > 0) + tid = runner_cuda_gettask ( &cuda_queues[1]); } /*Everyone wait for us to get a task id*/ __syncthreads(); - /* Exit if we didn't get a valid task. */ - if(tid < 0) + /* Exit if all tasks have been taken from queues and we don't have work to do. */ + if(tid < 0 && tot_num_tasks == 0) break; + #ifdef DEBUG_GPU + if(tid < 0 && cuda_queues[0].nr_avail_tasks == 0 && cuda_queues[1].nr_avail_tasks == 0 && cuda_queues[2].nr_avail_tasks == 0) + break; + #endif + /* If we couldn't find a task but some are not completed, try again. */ + if(tid < 0) + continue; - /*Start the task clock*/ - if( threadIdx.x == 0 ) - tasks_cuda[tid].tic = clock(); + /*Start the task clock*/ + if( threadIdx.x == 0 ){ + tasks_cuda[tid].blockID = blockIdx.x; + tasks_cuda[tid].tic = clock64(); + } + + /* Pick task type to do, if its not load, unload or ghost use the user supplied function. */ if( tasks_cuda[tid].type == type_load ) { int *d = (int*)&data_cuda[tasks_cuda[tid].data]; @@ -422,25 +433,31 @@ __global__ void qsched_device_kernel ( ) src = (int*)res_cuda[d[0]].gpu_data; dest = (int*)res_cuda[d[0]].data; cuda_memcpy_tasks( dest, src , res_cuda[d[0]].size, d[0]); - }else{ + }else if (tasks_cuda[tid].type != type_ghost ){ fun(tasks_cuda[tid].type , &data_cuda[tasks_cuda[tid].data]); } __syncthreads(); /*Stop the task clock*/ if( threadIdx.x == 0 ) - tasks_cuda[tid].toc = clock(); + tasks_cuda[tid].toc = clock64(); /*Unlocks*/ #ifdef GPU_locks if(threadIdx.x == 0) cuda_done( &tasks_cuda[tid] ); - // __syncthreads(); + __syncthreads(); #endif for(i = threadIdx.x; i < tasks_cuda[tid].nr_unlocks; i += blockDim.x ) { if( atomicSub( &tasks_cuda[tasks_cuda[tid].unlocks[i]].wait , 1 ) == 1 && !( tasks_cuda[tasks_cuda[tid].unlocks[i]].flags & task_flag_skip )) { - cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] ); + /* Place unloads into highest priority queue, any other task goes to normal priority queue. Load tasks are never unlocked.*/ + if(tasks_cuda[tasks_cuda[tid].unlocks[i]].type != type_unload) + { + cuda_queue_puttask( &cuda_queues[0] , tasks_cuda[tid].unlocks[i] ); + } + else + cuda_queue_puttask( &cuda_queues[2] , tasks_cuda[tid].unlocks[i] ); } } } @@ -449,23 +466,30 @@ __global__ void qsched_device_kernel ( ) /* Make a notch on the barrier, last one out cleans up the mess... */ __syncthreads(); if ( threadIdx.x == 0 ) - tid = ( atomicAdd( &cuda_barrier , 1 ) == gridDim.x-1 ); + done = ( atomicAdd( &cuda_barrier , 1 ) == gridDim.x-1 ); __syncthreads(); - if ( tid ) { + if ( done ) { if ( threadIdx.x == 0 ) { cuda_barrier = 0; + + /* Reset the load task queue so load tasks are ready again.*/ volatile int *temp = cuda_queues[1].data; cuda_queues[1].data = cuda_queues[1].rec_data; cuda_queues[1].rec_data = temp; + + /* Reset values.*/ cuda_queues[0].first = 0; cuda_queues[0].last = 0; cuda_queues[0].rec_count = 0; cuda_queues[1].first = 0; - cuda_queues[1].last = 0; + cuda_queues[1].last = cuda_queues[1].count; cuda_queues[1].rec_count = 0; - + cuda_queues[2].first = 0; + cuda_queues[2].last = 0; + cuda_queues[2].rec_count = 0; } -//TODO + //TODO + /* Resetting waits is done on the CPU at this time. This may be changed in future.*/ /* for ( int j = threadIdx.x ; j < cuda_nr_tasks ; j+= blockDim.x ) for ( k = 0 ; k < tasks_cuda[j].nr_unlock ; k++ ) { @@ -479,7 +503,7 @@ __global__ void qsched_device_kernel ( ) int maxVal( int *array, int size ) { - int i, maxi=-32000; + int i, maxi=-3200000; for (i=0; i<size; i++) { if (array[i]>maxi) @@ -506,30 +530,31 @@ int minVal( int *array, int size ) void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, int parent, int *res, int *sorted ) { int i,j; + int task, utask; if(numChildren > 0 && size/numChildren > 128*sizeof(int)) { /* Create dummy task for this resource and recurse to its children!*/ - int task, utask; task = qsched_addtask( s, type_ghost, task_flag_none, NULL, 0 , 0 ); + qsched_adduse(s, task, ID); s->res[ID].task = task; utask = qsched_addtask( s , type_ghost, task_flag_none, NULL, 0 , 0 ); + qsched_adduse(s, task, ID); s->res[ID].task = utask; if(parent >= 0) { /* Create dependecy to parent. */ - qsched_addunlock(s, task, s->res[parent].task ); - qsched_addunlock(s, s->res[parent].utask, utask ); + /*qsched_addunlock(s, task, s->res[parent].task ); + qsched_addunlock(s, s->res[parent].utask, utask );*/ } for(i = sorted[ID]; i < sorted[ID+1]; i++) { qsched_create_loads(s, i, s->res[res[i]].size, sorted[i+1]-sorted[i], ID, res, sorted); } }else{ - int task,utask; task = qsched_addtask( s , type_load , task_flag_none, &ID, sizeof(int), 0 ); s->res[ID].task = task; utask = qsched_addtask( s , type_unload, task_flag_none, &ID, sizeof(int), 0 ); - s->res[ID].task = utask; + s->res[ID].utask = utask; /* Create load task for this resource and set its children to completed with this task.*/ for( j = sorted[ID]; j < sorted[ID+1]; j++ ) { @@ -538,15 +563,221 @@ void qsched_create_loads(struct qsched *s, int ID, int size, int numChildren, in /* If it has a parent then set the parents ghost task to be dependent on this.*/ if(parent >= 0) { - qsched_addunlock(s, task, s->res[parent].task ); - qsched_addunlock(s, s->res[parent].utask, utask); + /*qsched_addunlock(s, task, s->res[parent].task ); + qsched_addunlock(s, s->res[parent].utask, utask);*/ + } + } +} + +#define MAX_DEPTH 0 +int transitive_use_unlocks(struct qsched *s, struct task *t, int res, int depth) +{ + int i; + for(i = 0; i < t->nr_uses; i++) + { + if(t->uses[i] == res) + return 1; + } + for(i = 0; i < t->nr_locks; i++) + { + if(t->locks[i] == res) + return 1; + } + if(depth >= MAX_DEPTH) + { +// printf("Max Depth reached\n"); + return 0; + } + for(i = 0; i < t->nr_unlocks; i++) + { + if(transitive_use_unlocks(s, &s->tasks[t->unlocks[i]], res, depth + 1)) + return 1; + } + return 0; +} + +int transitive_use_locks(struct qsched *s, int tid, int res, int depth) +{ + int i,j; + struct task *new_t; + struct task *t = &s->tasks[tid]; + for(i = 0; i < t->nr_uses; i++) + { + if(t->uses[i] == res) + return 1; + } + for(i = 0; i < t->nr_locks; i++) + { + if(t->locks[i] == res) + return 1; + } + if(depth >= MAX_DEPTH) + { +// printf("Max Depth reached\n"); + return 0; + } + for(i = tid-1; i >= 0; i--) + { + new_t = &s->tasks[i]; + for(j = 0; j < new_t->nr_unlocks; j++) + { + if(new_t->unlocks[j] == tid) + { + if(transitive_use_locks(s, i, res , depth + 1)) + return 1; + break; + } } } + return 0; +} + +/* +* Not used. +*/ +void qsched_prepare_deps( struct qsched *s ) +{ + int **is_loaded; + int **parents; + int i, k, j, use, usek, usem; + int *num_parents; + is_loaded = (int**)malloc(sizeof(int*) * s->count); + parents = (int**)malloc(sizeof(int*) * s->count); + num_parents = (int*)malloc(sizeof(int) * s->count); + bzero(num_parents, sizeof(int)*s->count); + k = (sizeof(int)*s->count_res)/32 +1; + for(i = 0; i < s->count; i++) + { + is_loaded[i] = (int*)malloc(k); + bzero(is_loaded[i], k); + } + /* Is loaded[i][k] gives the set of k*32 resources for task i*/ + + /* Reset the waits to 0... */ + for( k = 0; k < s->count; k++ ) + { + s->tasks[k].wait = 0; + } + + /* Run through the tasks and set the waits... */ + for ( k = 0 ; k < s->count ; k++ ) { + struct task *t = &s->tasks[k]; + if ( !( t->flags & task_flag_skip ) ) + for ( j = 0 ; j < t->nr_unlocks ; j++ ) + s->tasks[ t->unlocks[j] ].wait += 1; + } + + /* Sort the tasks topologically. */ + int *tid = (int *)malloc( sizeof(int) * s->count ); + for ( j = 0 , k = 0 ; k < s->count ; k++ ) + if ( s->tasks[k].wait == 0 ) { + tid[j] = k; + j += 1; + } + for ( k = 0 ; k < j ; k++ ) { + struct task *t = &s->tasks[ tid[k] ]; + for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ ) + if ( ( s->tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) { + tid[j] = t->unlocks[kk]; + j += 1; + } + } + if ( k < s->count ) + { + error( "Circular dependencies detected." ); + } + int max_parents = 0; + for(i = s->count-1; i >= 0; i--) + { + for(j = 0; j < s->tasks[i].nr_unlocks; j++) + { + num_parents[s->tasks[i].unlocks[j]]++; + } + } + for(i = 0; i < s->count; i++) + { + if(num_parents[i] > 0) + parents[i] = (int*)calloc(num_parents[i],sizeof(int)); + else + parents[i] = NULL; + + if(num_parents[i] > max_parents) + { + max_parents = num_parents[i]; + } + + num_parents[i] = 0; + } + + for(i = 0; i < s->count; i++) + { + if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload) + continue; + for(k = 0; k < s->tasks[i].nr_uses; k++) + { + use = s->tasks[i].uses[k]; + usek = use >> 5; // use / 32; + usem = use & 31; // use % 32. + + if((is_loaded[i][usek] & (1 << (31-usem))) == 0 ) + { + qsched_addunlock(s, s->res[use].task , i ) ; + is_loaded[i][usek] |= (1 <<(31-usem)); + } + } + for(k = 0; k < s->tasks[i].nr_unlocks; k++) + { + if(s->tasks[s->tasks[i].unlocks[k]].type == type_load || + s->tasks[s->tasks[i].unlocks[k]].type == type_unload ) + continue; + for(j = 0; j < s->count_res/32 +1; j++) + { + is_loaded[s->tasks[i].unlocks[k]][j] |= is_loaded[i][j]; + } + parents[s->tasks[i].unlocks[k]][num_parents[s->tasks[i].unlocks[k]]] = i; + num_parents[s->tasks[i].unlocks[k]] = num_parents[s->tasks[i].unlocks[k]] + 1; + } + } + max_parents = 0; + for(i = 0; i < s->count; i++) + { + if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload) + continue; + if(num_parents[i] > max_parents) + { + max_parents = num_parents[i]; + } + bzero(is_loaded[i], k); + } + for(i = s->count-1; i >= 0; i--) + { + if(s->tasks[i].type == type_load || s->tasks[i].type == type_unload) + continue; + for(k = 0; k < s->tasks[i].nr_uses; k++) + { + use = s->tasks[i].uses[k]; + usek = use >> 5; // use / 32; + usem = use & 31; // use % 32. + if((is_loaded[i][usek] & (1 << (31-usem))) == 0 ) + { + qsched_addunlock(s, i, s->res[use].utask ); + is_loaded[i][usek] |= (1 << (31-usem)); + } + } + for(k = 0; k < num_parents[i]; k++) + { + for(j = 0; j < s->count_res/32 +1; j++) + { + is_loaded[parents[i][k]][j] |= is_loaded[i][j]; + } + } + } + } void qsched_prepare_loads ( struct qsched *s ) { -int i, task, unload, j , k , unlocked = 0; +int i, task, unload, j , k , x, unlocked = 0; struct task *t; int *sorted, lastindex; int *res, *res_data; @@ -556,6 +787,89 @@ if(s->res[0].task != -1) printf("Tasks already initialised, not redoing load/unload tasks"); return; } +double itpms = 1000.0 / CPU_TPS; +ticks tic, toc_run ; + tic = getticks(); + +/* Expand the deps array so we can add new dependencies in place. */ +qsched_task_t *deps_new, *deps_new_key; +s->size_deps *= qsched_stretch; + + + +j = 0; +//printf("%i\n", s->tasks[94].unlocks); +// for(k = 0; k < s->tasks[94].nr_unlocks; k++) +// printf("%i ", s->tasks[94].unlocks[k]); +// printf("\n"); + +/* Allocate a new dependency list. */ +if ( ( deps_new = (int*) malloc(s->size_deps * sizeof(int) ) ) == NULL || + ( deps_new_key = (int*) malloc( s->size_deps * sizeof(int) ) ) == NULL ) + error( "Failed to allocate new deps lists." ); + +/* Copy the dependency list to the new list, leaving a space between each task to fit unload dependencies in.*/ +for(i = 0; i < s->count; i++) +{ + t = &s->tasks[i]; + /* Its possible we might not fit in weird cases so we need to make sure to expand if needed.*/ + if(j + t->nr_unlocks + t->nr_uses + t->nr_locks > s->size_deps) + { + // printf("\n"); + qsched_task_t *temp1, *temp2; + s->size_deps *= qsched_stretch; + /* Allocate a new task list. */ + if ( ( temp1 = (int*) malloc(s->size_deps * sizeof(int) ) ) == NULL || + ( temp2 = (int*) malloc( s->size_deps * sizeof(int) ) ) == NULL ) + error( "Failed to allocate new deps lists." ); + /* Copy the deps and keys over to the new list. */ + memcpy( temp1 , deps_new , sizeof(int) * /*s->count_deps*/ j); + memcpy( temp2 , deps_new_key , sizeof(int) * /*s->count_deps*/ j ); + int m; + /* Have to update since we aren't resorting later.*/ + for(m = 0; m < i; m++) + { + + t = &s->tasks[m]; + t->unlocks = &temp1[t->unlocks - deps_new]; + } + free(deps_new); + free(deps_new_key); + deps_new = temp1; + deps_new_key = temp2; +// printf("Stretch at line 828. m = %i\n", m); + t = &s->tasks[i]; + } + int start_j = j; + + /*if(i <= 94) +{ + printf("%i\n", s->tasks[94].unlocks); + for(k = 0; k < s->tasks[94].nr_unlocks; k++) + printf("%i ", s->tasks[94].unlocks[k]); + printf(" i = %i\n", i); + printf("%i\n", t->unlocks); +}*/ + for(k = 0; k < t->nr_unlocks; k++) + { + deps_new[j] = t->unlocks[k]; + //if(i == 48) + // printf("%i %i %i %i\n", t->unlocks +k,s->deps_key, s->deps, s->count_deps); + deps_new_key[j] = s->deps_key[&t->unlocks[k] - s->deps]; + j++; + } + t->unlocks = &deps_new[start_j]; + /*if(i == 94) +{ + for(k = 0; k < t->nr_unlocks; k++) + printf("%i ", t->unlocks[k]); + printf("\n"); +}*/ + j+=t->nr_uses + t->nr_locks; +} + +s->count_deps = j; + /* Store number of children for each resource*/ sorted = (int*) malloc(sizeof(int) * (s->count_res+1)); @@ -601,12 +915,13 @@ for(i = 0; i < s->count_res; i++) mini = res_data[i]; // printf("%i ", res_data[i]); } + for(i = 0; i < s->count_res; i++) { res_data[i] -= mini; -printf("%i ", res_data[i]); +//printf("%i ", res_data[i]); } -printf("\n"); +//printf("\n"); /* Sort the children of each parent by memory address. */ qsched_sort(res, res_data, sorted[0], minVal(res_data, sorted[0]), maxVal(res_data, sorted[0])); @@ -621,13 +936,13 @@ for(i = 1; i < s->count_res; i++) qsched_sort(&res[sorted[s->count_res-1]], &res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1], minVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1]), maxVal(&res_data[sorted[s->count_res-1]], s->count_res - sorted[s->count_res-1])); -/*for(i = 0; i < s->count_res; i++) +/* +for(i = 0; i < s->count_res; i++) { printf("%i ", res_data[i]); } printf("\n"); - for(i = 0; i < s->count_res; i++) { printf("%i ", res[i]); @@ -657,17 +972,20 @@ for(i = 1; i < s->count_res; i++) { for(j = sorted[i-1]; j < sorted[i]-1; j++) { - if(res_data[j] + s->res[res[j]].size > res_data[j+1]) + if(res_data[j] + (s->res[res[j]].size/sizeof(int)) > res_data[j+1]) error("Overlapping resources are not allowed."); } } } /* Check super resources don't overlap.*/ -for( i = sorted[s->count_res-1]; i < s->count_res; i++ ) +for( i = sorted[s->count_res-1]; i < s->count_res-1; i++ ) { - if(res_data[i] + s->res[res[i]].size > res_data[i+1]) + if(res_data[i] + (s->res[res[i]].size/sizeof(int)) > res_data[i+1]) + { + /*printf("i=%i, s->count_res=%i, res_data[i] = %i, size = %i, res_data[i]+size = %i, res_data[i+1] = %i\n",i, s->count_res, res_data[i], s->res[res[i]].size, res_data[i] + (s->res[res[i]].size/sizeof(int)), res_data[i+1]);*/ error("Overlapping resources are not allowed."); + } } /* Reposition sorted pointers so that sorted[i] points to the first child of task ID= i*/ @@ -675,6 +993,11 @@ for(i = sorted[s->count_res]; i >= 0; i-- ) { sorted[i] = sorted[i-1]; } + +toc_run = getticks(); +// message( "Sorting took %.3f ms" , ((double)(toc_run - tic)) * itpms ); + +tic = getticks(); /* If nothing overlaps create tasks.*/ for( i = sorted[s->count_res]; i < s->count_res; i++ ) { @@ -688,7 +1011,11 @@ for( i = sorted[s->count_res]; i < s->count_res; i++ ) } -/* Check all tasks have load tasks - if not give parents (recursively)*/ +toc_run = getticks(); +// message( "Creating load tasks took %.3f ms" , ((double)(toc_run - tic)) * itpms ); + + +/* Check all resources have load tasks - if not give parents (recursively)*/ for(i = 0; i < s->count_res; i++ ) { if( s->res[i].task == -1 ) @@ -714,41 +1041,355 @@ for(i = 0; i < s->count_res; i++ ) } } +int **usage_list; +int *num_uses; +int *size_uses; +usage_list = (int**)malloc(sizeof(int*) * s->count_res); +num_uses = (int*) malloc(sizeof(int) * s->count_res ); +size_uses = (int*) malloc(sizeof(int) * s->count_res); +for(i = 0; i < s->count_res; i++ ) +{ + usage_list[i] = (int*) malloc(sizeof(int) * s->count_uses / s->count_res + 1); + num_uses[i] = 0; + size_uses[i] = s->count_uses / s->count_res + 1; +} +/* Add deps from tasks to unload tasks. */ +for(i = 0; i < s->count; i++) +{ + t = &s->tasks[i]; + if(t-> type == type_unload || t->type == type_load || t->type == type_ghost ) + continue; -/* Set up dependencies with the rest of the system.*/ -for( i = 0; i < s->count_res; i++ ) + for(j = 0; j < t->nr_uses; j++) + { + t->unlocks[t->nr_unlocks] = s->res[t->uses[j]].utask; + deps_new_key[(t->unlocks - deps_new) + t->nr_unlocks] = i; + t->nr_unlocks++; + if(num_uses[t->uses[j]] == size_uses[t->uses[j]]) + { + /* Stretch. */ + int* temp = (int*) malloc(sizeof(int) * size_uses[t->uses[j]] * 2 ); + memcpy( temp, usage_list[t->uses[j]], sizeof(int) * size_uses[t->uses[j]]); + free(usage_list[t->uses[j]]); + usage_list[t->uses[j]] = temp; + size_uses[t->uses[j]] *=2; + } + usage_list[t->uses[j]][num_uses[t->uses[j]]++] = i; + } + for(j = 0; j < t->nr_locks; j++) + { + t->unlocks[t->nr_unlocks] = s->res[t->locks[j]].utask; + deps_new_key[(t->unlocks - deps_new) + t->nr_unlocks] = i; + t->nr_unlocks++; + if(num_uses[t->locks[j]] == size_uses[t->locks[j]]) + { + /* Stretch. */ + int* temp = (int*) malloc(sizeof(int) * size_uses[t->locks[j]] * 2 ); + memcpy( temp, usage_list[t->locks[j]], sizeof(int) * size_uses[t->locks[j]]); + free(usage_list[t->locks[j]]); + usage_list[t->locks[j]] = temp; + size_uses[t->locks[j]] *=2; + } + usage_list[t->locks[j]][num_uses[t->locks[j]]++] = i; + } +} + +/* Loop through resources. */ + +// TODO Make sure to stretch if needed. +for(i = 0; i < s->count_res; i++ ) { - int unlocked = 0; - for( j = 0; j < s->count; j++ ) + int ID = res[i]; + int size = s->res[ID].size; + int numChildren = sorted[ID+1] - sorted[ID]; + int parent = s->res[ID].parent; + struct res *resource = &s->res[ res[i] ]; + + /* Loop through children if there are any. */ + if(numChildren > 0) { - struct task *t = &s->tasks[j]; - for(k = 0; k < t->nr_uses; k++) + /* Do unload task stuff first. */ + s->tasks[resource->utask].unlocks = &deps_new[s->count_deps]; + s->tasks[resource->utask].nr_unlocks = 0; + if(s->count_deps + numChildren > s->size_deps) { - if(t->uses[k] == i) + qsched_task_t *temp1, *temp2; + s->size_deps *= qsched_stretch; + /* Allocate a new task list. */ + if ( ( temp1 = (int*) malloc( sizeof(int) * s->size_deps ) ) == NULL || + ( temp2 = (int*) malloc( sizeof(int) * s->size_deps ) ) == NULL ) + error( "Failed to allocate new deps lists." ); + /* Copy the deps and keys over to the new list. */ + memcpy( temp1 , deps_new , sizeof(int) * s->count_deps ); + memcpy( temp2 , deps_new_key , sizeof(int) * s->count_deps ); + int m; + /* Have to update since we aren't resorting later.*/ + for(m = 0; m < s->count; m++) { - qsched_addunlock(s, s->res[i].task ,j); - qsched_addunlock(s, j, s->res[i].utask); - unlocked = 1; - break; + t = &s->tasks[m]; + t->unlocks = &temp1[t->unlocks - deps_new]; } + free(deps_new); + free(deps_new_key); + deps_new = temp1; + deps_new_key = temp2; +// printf("Stretch at line 1102, m = %i.\n", m); } - if(unlocked == 1) + for(j = 0; j < numChildren; j++) { - unlocked = 0; - continue; + struct res *child = &s->res[ res[sorted[ ID ]+j] ]; + + if( child->utask != resource->utask ) + { + s->tasks[resource->utask].unlocks[ s->tasks[resource->utask].nr_unlocks ] = child->utask; + deps_new_key[s->count_deps] = resource->utask; + s->tasks[resource->utask].nr_unlocks += 1; + s->count_deps += 1; + } } - for(k = 0; k < t->nr_locks; k++) + } + + /* Do load task stuff. */ + + s->tasks[resource->task].unlocks = &deps_new[s->count_deps]; + s->tasks[resource->task].nr_unlocks = 0; + if(numChildren > 0) + { + for(j = 0; j < numChildren; j++) { - if(t->locks[k] == i) + struct res *child = &s->res[ res[sorted[ ID ]+j] ]; + if( child->utask == resource->utask ) { - qsched_addunlock(s, s->res[i].task ,j); - qsched_addunlock(s, j, s->res[i].utask); - break; + if( s->size_deps < s->count_deps + num_uses[res[ sorted[ ID ] + j ] ] ) + { + qsched_task_t *temp1, *temp2; + s->size_deps *= qsched_stretch; + /* Allocate a new task list. */ + if ( ( temp1 = (int*) malloc( sizeof(int) * s->size_deps ) ) == NULL || + ( temp2 = (int*) malloc( sizeof(int) * s->size_deps ) ) == NULL ) + error( "Failed to allocate new deps lists." ); + /* Copy the deps and keys over to the new list. */ + memcpy( temp1 , deps_new , sizeof(int) * s->count_deps ); + memcpy( temp2 , deps_new_key , sizeof(int) * s->count_deps ); + int m; + /* Have to update since we aren't resorting later.*/ + for(m = 0; m < s->count; m++) + { + t = &s->tasks[m]; + t->unlocks = &temp1[t->unlocks - deps_new]; + } + free(deps_new); + free(deps_new_key); + deps_new = temp1; + deps_new_key = temp2; +// printf("Stretch at line 1151.\n"); + } + for(k = 0; k < num_uses[ res[ sorted[ ID ] + j ] ]; k++) + { + s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ sorted[ ID ] +j ] ][k]; + deps_new_key[s->count_deps] = resource->task; + s->tasks[resource->task].nr_unlocks += 1; + s->count_deps += 1; + } } } } + if( s->size_deps < s->count_deps + 1 + num_uses[ res[ i ] ]) + { + qsched_task_t *temp1, *temp2; + s->size_deps *= qsched_stretch; + /* Allocate a new task list. */ + if ( ( temp1 = (int*) malloc( sizeof(int) * s->size_deps ) ) == NULL || + ( temp2 = (int*) malloc( sizeof(int) * s->size_deps ) ) == NULL ) + error( "Failed to allocate new deps lists." ); + /* Copy the deps and keys over to the new list. */ + memcpy( temp1 , deps_new , sizeof(int) * s->count_deps ); + memcpy( temp2 , deps_new_key , sizeof(int) * s->count_deps ); + int m; + /* Have to update since we aren't resorting later.*/ + for(m = 0; m < s->count; m++) + { + t = &s->tasks[m]; + t->unlocks = &temp1[t->unlocks - deps_new]; + } + free(deps_new); + free(deps_new_key); + deps_new = temp1; + deps_new_key = temp2; +// printf("Stretch at line 1185.\n"); + } + if( parent > 0 ) + { + s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = s->res[parent].task; + deps_new_key[s->count_deps] = resource->task; + s->tasks[resource->task].nr_unlocks += 1; + s->count_deps += 1; + } + + for(k = 0; k < num_uses[ res[ i ] ] ; k ++ ) + { + s->tasks[resource->task].unlocks[ s->tasks[resource->task].nr_unlocks ] = usage_list[ res[ i ] ][k]; + deps_new_key[s->count_deps] = resource->task; + s->tasks[resource->task].nr_unlocks += 1; + s->count_deps += 1; + } + } +//printf("s->deps = %i, deps_new = %i\n", s->deps, deps_new); +free(s->deps); +free(s->deps_key); +s->deps = deps_new; +s->deps_key = deps_new_key; +s->flags &= ~qsched_flag_dirty; +tic = getticks(); +//printf("Number tasks = %i\n", s->count); +//printf("Number dependencies = %i\n", s->count_deps); +/* Set up dependencies with the rest of the system.*/ + + + +/*New version*/ + /* Reset the waits to 0... */ +/* for( k = 0; k < s->count; k++ ) + { + s->tasks[k].wait = 0; + } */ + + /* Run through the tasks and set the waits... */ +/* for ( k = 0 ; k < s->count ; k++ ) { + struct task *t = &s->tasks[k]; + if ( !( t->flags & task_flag_skip ) ) + for ( j = 0 ; j < t->nr_unlocks ; j++ ) + s->tasks[ t->unlocks[j] ].wait += 1; + } + */ + /* Sort the tasks topologically. */ +/* int *tid = (int *)malloc( sizeof(int) * s->count ); + for ( j = 0 , k = 0 ; k < s->count ; k++ ) + if ( s->tasks[k].wait == 0 ) { + tid[j] = k; + j += 1; + } + for ( k = 0 ; k < j ; k++ ) { + struct task *t = &s->tasks[ tid[k] ]; + for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ ) + if ( ( s->tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) { + tid[j] = t->unlocks[kk]; + j += 1; + } + } + if ( k < s->count ) + { + //printf("k = %i, count = %i\n", k, count); + error( "Circular dependencies detected." ); + } + +*/ +/*Do unlocks */ +/*for(i = 0; i < s->count; i++) +{ + struct task *t = &s->tasks[i]; + int result = 0; + if(t->type == type_ghost || t->type == type_unload || t->type == type_load) + continue; + for(j = 0; j < t->nr_uses; j++) + { + result = 0; + for(k = 0; k < t->nr_unlocks && result == 0; k++) + { + result = transitive_use_unlocks(s, &s->tasks[t->unlocks[k]], t->uses[j],0); + + + } + if( result == 0) + qsched_addunlock(s, i, s->res[t->uses[j]].utask); + } + for(j = 0; j < t->nr_locks; j++) + { + result = 0; + for(k = 0; k < t->nr_unlocks && result == 0; k++) + { + result =transitive_use_unlocks(s, &s->tasks[t->unlocks[k]], t->locks[j],0); + + } + if(result == 0) + qsched_addunlock(s, i, s->res[t->locks[j]].utask); + } +}*/ + +/*Do locks */ +/*for(i = s->count-1; i >= 0; i--) +{ + struct task *t = &s->tasks[i]; + struct task *new_t; + int result = 0; + if(t->type == type_ghost || t->type == type_unload || t->type == type_load) + continue; + for(j = 0; j < t->nr_uses; j++) + { + result = 0; + for(k = i-1; k >= 0; k--) + { + new_t = &s->tasks[k]; + for(x = 0; x < new_t->nr_unlocks && result == 0; x++) + { + if(new_t->unlocks[x] == i) + { + result = transitive_use_locks(s, k, t->uses[j],0); + } + } + } + if(result == 0) + { + qsched_addunlock(s, s->res[t->uses[j]].task, i); + } + } + for(j = 0; j < t->nr_locks; j++) + { + result = 0; + for(k = i-1; k >= 0; k--) + { + new_t = &s->tasks[k]; + for(x = 0; x < new_t->nr_unlocks && result == 0; x++) + { + if(new_t->unlocks[x] == i) + { + result = transitive_use_locks(s, k, t->locks[j],0); + } + } + } + if(result == 0) + { + qsched_addunlock(s, s->res[t->locks[j]].task, i); + } + } + +}*/ + +/* Old version*/ +/*for(i = 0; i < s->count; i++) +{ + struct task *t = &s->tasks[i]; + if(t->type == type_load || t->type == type_unload || t-> type == type_ghost) + continue; + + for(k = 0; k < t->nr_uses; k++) + { + qsched_addunlock(s, s->res[t->uses[k]].task, i); + qsched_addunlock(s, i, s->res[t->uses[k]].utask); + } + + for(k = 0; k < t->nr_locks; k++) + { + qsched_addunlock(s, s->res[t->locks[k]].task, i); + qsched_addunlock(s, i, s->res[t->locks[k]].utask); + } +}*/ +//qsched_prepare_deps( s ); +//printf("Number dependencies = %i\n", s->count_deps); +toc_run = getticks(); +// message( "Setting up dependencies took %.3f ms" , ((double)(toc_run - tic)) * itpms ); //error("Got to here"); } @@ -770,7 +1411,6 @@ for(i = 0; i < s->count_res; i++) // cudaMalloc( &s->res[ i ].gpu_data, s->res[i].size ); task = qsched_addtask( s , type_load , task_flag_none , &i , sizeof(int) , 0 ); s->res[i].task = task; - printf("s->res[i].task = %i\n", s->res[i].task); unload = qsched_addtask( s , type_unload, task_flag_none , &i, sizeof(int), 0 ); /*Load task unlocks each task that uses or locks the specified resource */ /*Unload task is unlocked by each task that is unlocked by the load task. */ @@ -869,11 +1509,26 @@ char *sdata; /* All cleaned-up now! */ //s->flags &= ~qsched_flag_dirty; + } -qsched_prepare_loads(s); + double itpms = 1000.0 / CPU_TPS; +ticks tic, toc_run ; + tic = getticks(); +//qsched_ensure(s, s->count + 2*s->count_res, s->count_res, s->count_deps + 2*s->count_res, s->count_locks, s->count_uses + 2*s->count_res, s->size_data ); + for ( k = 0 ; k < count ; k++ ) { + t = &tasks[k]; + if ( !( t->flags & task_flag_skip ) ) + for ( j = 0 ; j < t->nr_unlocks ; j++ ) + { + tasks[ t->unlocks[j] ].wait += 1; + } + } +qsched_prepare_loads(s); +toc_run = getticks(); +// message( "prepare_loads took %.3f ms" , ((double)(toc_run - tic)) * itpms ); @@ -887,22 +1542,22 @@ qsched_prepare_loads(s); if ( s->flags & qsched_flag_dirty ) { /* Do the sorts in parallel, if possible. */ - #pragma omp parallel - { + // #pragma omp parallel + //{ /* Sort the unlocks. */ - #pragma omp single nowait - qsched_sort( s->deps , s->deps_key , s->count_deps , 0 , count - 1 ); + //#pragma omp single nowait + // qsched_sort( s->deps , s->deps_key , s->count_deps , 0 , count - 1 ); /* Sort the locks. */ - #pragma omp single nowait - qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 ); + // #pragma omp single nowait + // qsched_sort( s->locks , s->locks_key , s->count_locks , 0 , count - 1 ); /* Sort the uses. */ - #pragma omp single nowait - qsched_sort( s->uses , s->uses_key , s->count_uses , 0 , count - 1 ); + // #pragma omp single nowait + // qsched_sort( s->uses , s->uses_key , s->count_uses , 0 , count - 1 ); - } + // } /* Run throught the tasks and link the locks and unlocks. */ tasks[0].unlocks = s->deps; @@ -918,7 +1573,6 @@ qsched_prepare_loads(s); s->flags &= ~qsched_flag_dirty; } - /* Init the queues. */ for ( k = 0 ; k < s->nr_queues ; k++ ) queue_init( &s->queues[k] , count ); @@ -929,46 +1583,107 @@ qsched_prepare_loads(s); tasks[k].wait = 0; } + int* store_waits; + if( (store_waits = (int*) malloc(sizeof(int)*s->count) ) == NULL) + error("Failed to allocate store_waits"); + /* Run through the tasks and set the waits... */ for ( k = 0 ; k < count ; k++ ) { t = &tasks[k]; if ( !( t->flags & task_flag_skip ) ) for ( j = 0 ; j < t->nr_unlocks ; j++ ) + { tasks[ t->unlocks[j] ].wait += 1; + } } - /* Sort the tasks topologically. */ int *tid = (int *)malloc( sizeof(int) * count ); for ( j = 0 , k = 0 ; k < count ; k++ ) + { + store_waits[k] = tasks[k].wait; if ( tasks[k].wait == 0 ) { tid[j] = k; j += 1; } + } for ( k = 0 ; k < j ; k++ ) { t = &tasks[ tid[k] ]; for ( int kk = 0 ; kk < t->nr_unlocks ; kk++ ) + { if ( ( tasks[ t->unlocks[kk] ].wait -= 1 ) == 0 ) { tid[j] = t->unlocks[kk]; j += 1; } } + } + + /* Print all dependencies */ + /*for(i = 0; i < count; i++ ) + { + printf("Task ID: %i, type=%i, ", i, tasks[i].type); + for(j = 0; j < tasks[i].nr_unlocks; j++) + { + printf("%i ", tasks[i].unlocks[j]); + } + printf("\n"); + }*/ if ( k < count ) + { + printf("k = %i, wait = %i\n", tid[k-1], tasks[tid[k-1]].wait); + printf("tasks[0].nr_unlocks=%i\n", tasks[0].nr_unlocks); + for(i = 0; i < k-1; i++) + { + t = &tasks[tid[i]]; + for(j = 0; j < t->nr_unlocks; j++) + { + if(t->unlocks[j] == tid[k-1]) + printf("%i ", tid[i]); + + } + } + printf("\n"); + for(i = 0; i < tasks[tid[k-1]].nr_unlocks; i++) + { + printf("%i ", tasks[tid[k-1]].unlocks[i]); + } + printf("\n"); + for(i = 0; i < tasks[tid[k-1]].nr_unlocks; i++) + { + printf("unlocks[%i] = %i wait = %i\n", i, tasks[tid[k-1]].unlocks[i], tasks[tasks[tid[k-1]].unlocks[i]].wait); + for(j = 0; j < tasks[tasks[tid[k-1]].unlocks[i]].nr_unlocks; j++) + { + printf("%i ", tasks[tasks[tid[k-1]].unlocks[i]].unlocks[j]); + } + printf("\n"); + } + printf("\n\n"); + for(i = 0; i < k; i++) + { + if(tasks[tid[i]].type == 0) + { + int* idata = (int*)&(s->data[tasks[tid[i]].data]); + printf("%i %i,", tid[i], idata[0]); + } + } + printf("\n"); error( "Circular dependencies detected." ); - + } /* Run through the topologically sorted tasks backwards and set their weights, re-setting the waits while we're at it. */ for ( k = count-1 ; k >= 0 ; k-- ) { int maxweight = 0; t = &tasks[ tid[k] ]; + t->wait = store_waits[tid[k]]; for ( j = 0 ; j < t->nr_unlocks ; j++ ) { - tasks[ t->unlocks[j] ].wait += 1; if ( tasks[ t->unlocks[j] ].weight > maxweight ) maxweight = tasks[ t->unlocks[j] ].weight; } t->weight = t->cost + maxweight; } + free(store_waits); + /*Allocate temporary tasks to setup device tasks*/ temp = (struct task *) malloc(s->count * sizeof(struct task)); @@ -1023,16 +1738,6 @@ if( cudaMemcpyToSymbol ( uses_cuda, &setup_u , sizeof(qsched_res_t *), 0 , cud if( cudaMalloc( &res_t , sizeof(struct res) * s->count_res ) != cudaSuccess ) error("Failed to allocated on the device."); -for(i = 0; i < s->count_res; i++) -{ - // if(s->res[i].size == 0) - // continue; - //if( cudaMalloc( &data, s->res[i].size) != cudaSuccess ) - // error("Failed to allocate data space on the device."); - //s->res[i].gpu_data = data; -} - - if( cudaMemcpy( res_t , s->res , sizeof(struct res) * s->count_res , cudaMemcpyHostToDevice) != cudaSuccess ) error("Failed to copy resources to the device: %s", cudaGetErrorString(cudaPeekAtLastError())); if( cudaMemcpyToSymbol( res_cuda , &res_t , sizeof(struct res *) , 0 , cudaMemcpyHostToDevice) != cudaSuccess ) @@ -1056,21 +1761,9 @@ if( cudaMemcpy( cuda_t, temp, sizeof(struct task) * s->count , cudaMemcpyHostToD if( cudaMemcpyToSymbol ( tasks_cuda, &cuda_t , sizeof(struct task *) , 0 , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy task pointer to the device."); - - - -/*for(i = 0; i < s->count; i++) -{ - printf("Task:%i of type %i has wait %i\n", i, temp[i].type, temp[i].wait); -}*/ - - - - - /* Initialize the queues. */ -int nr_queues= 2,qsize; -//int *data; +int nr_queues= 3,qsize; +int *data2; struct queue_cuda queues[ cuda_numqueues ]; qsize = max(2*s->count / nr_queues, 256); @@ -1080,14 +1773,21 @@ struct queue_cuda queues[ cuda_numqueues ]; /* Allocate a temporary buffer for the queue data. */ if ( ( data = (int *)malloc( sizeof(int) * qsize ) ) == NULL ) error("Failed to allocate data buffer."); + if( ( data2 = (int *) malloc( sizeof(int) * qsize ) ) == NULL ) + error("Failed to allocate data2 buffer."); queues[1].count = 0; for(i = 0; i < s->count; i++) { if(temp[i].type == type_load && temp[i].wait == 0) + { data[queues[1].count++] = i; + data2[queues[1].count-1] = -temp[i].weight; + } } + qsched_sort(data, data2, queues[1].count, minVal(data2,queues[1].count), maxVal(data2, queues[1].count)); + free(data2); for ( k = queues[1].count ; k < qsize ; k++ ) data[k] = -1; /* Allocate and copy the data. */ @@ -1109,24 +1809,57 @@ struct queue_cuda queues[ cuda_numqueues ]; error("Failed to allocate queue data on the device."); if ( cudaMemcpy( (void *)queues[0].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) error("Failed to copy queue data pointer to the device"); +/* Allocate and copy the recyling data. */ + if ( cudaMalloc( &queues[2].rec_data , sizeof(int) * qsize ) != cudaSuccess ) + error("Failed to allocate queue data on the device."); + if ( cudaMemcpy( (void *)queues[2].rec_data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy queue data pointer to the device"); /* Set some other values. */ queues[1].first = 0; queues[1].last = queues[1].count; + queues[1].nr_avail_tasks = queues[1].last; queues[1].rec_count = 0; + /* Init queue 2*/ + int num_unload=0; + queues[2].count = 0; + for( k = 0; k < s->count; k++ ) + { + if(temp[k].type == type_unload) + { + num_unload++; + if(temp[k].wait == 0) + data[queues[2].count++] = k; + } + } + queues[2].first = 0; + queues[2].last = queues[2].count; + queues[2].nr_avail_tasks = queues[2].last; + queues[2].count = num_unload; + queues[2].rec_count = 0; + /* Allocate and copy the data. */ + if ( cudaMalloc( &queues[2].data , sizeof(int) * qsize ) != cudaSuccess ) + error("Failed to allocate queue data on the device."); + if ( cudaMemcpy( (void *)queues[2].data , data , sizeof(int) * qsize , cudaMemcpyHostToDevice ) != cudaSuccess ) + error("Failed to copy queue data pointer to the device"); + + for ( k = 0; k < qsize; k++ ) + data[k] = -1; + /* Init queue 0*/ queues[0].count = 0; for ( k = 0; k < s->count ; k++ ) { - if(temp[k].type != type_load && temp[k].wait == 0){ + if(temp[k].type != type_load && temp[k].type != type_unload && temp[k].wait == 0){ data[queues[0].count++] = k; } } queues[0].first = 0; queues[0].last = queues[0].count; - queues[0].count = s->count - queues[1].count; + queues[0].nr_avail_tasks = queues[0].last; + queues[0].count = s->count - queues[1].count - queues[2].count; queues[0].rec_count = 0; @@ -1142,7 +1875,6 @@ struct queue_cuda queues[ cuda_numqueues ]; /* Clean up. */ free( tid ); - printf("Queue[0].count = %i , queue[1].count = %i\n", queues[0].count, queues[1].count); /* Set the number of waiting tasks. */ s->waiting = count; @@ -1152,8 +1884,6 @@ struct queue_cuda queues[ cuda_numqueues ]; free( temp ); free( data ); - /* Unlock the sched. */ - //lock_unlock_blind( &s->lock ); } @@ -1175,40 +1905,36 @@ struct task* qsched_get_timers( struct qsched *s, int numtasks ) return cuda_tasks; } -/*void qsched_get_CUDA_tasks ( struct qsched *s , struct task *cuda_tasks , int numtasks ) -{ - -}*/ - /** * @brief Execute all the tasks in the current scheduler using - * OpenMP. + * CUDA. * * @param s Pointer to the #qsched. * @param fun User-supplied function that will be called with the * task type and a pointer to the task data. This must be a __device__ function! * * This function is only available if QuickSched was compiled with - * OpenMP support. + * CUDA support. */ void qsched_run_CUDA ( struct qsched *s, qsched_funtype func) { #ifdef WITH_CUDA + ProfilerStart("/home/aidan/quicksched-code/examples/profiler.out"); double itpms = 1000.0 / CPU_TPS; ticks tic, toc_run ; tic = getticks(); qsched_prepare_cuda( s ); toc_run = getticks(); - message( "prepare_cuda took %.3f ms" , ((double)(toc_run - tic)) * itpms ); +// message( "prepare_cuda took %.3f ms" , ((double)(toc_run - tic)) * itpms ); cudaMemcpyToSymbol( fun , &func , sizeof(qsched_funtype)); tic = getticks(); - qsched_device_kernel<<<1, 128 >>> ( ); + ProfilerStop(); + cudaMemcpyToSymbol( tot_num_tasks, &s->count, sizeof(int) ); + qsched_device_kernel<<<128, 128 >>> ( ); if( cudaDeviceSynchronize() != cudaSuccess ) error("Failed to execute kernel:%s", cudaGetErrorString(cudaPeekAtLastError())); toc_run = getticks(); - message( "run_CUDA took %.3f ms" , ((double)(toc_run - tic)) * itpms ); - - // TODO Unload timers (optional). + message( "run_CUDA took %.3f ms" , ((double)(toc_run - tic)) * itpms ); #else error("QuickSched was not compiled with CUDA support."); diff --git a/src/cuda_queue.h b/src/cuda_queue.h index 27f41c5f32674b8bdbc1c6249b44ccb60b39e161..b548f8ca512c7844b35609e85cc933f60a27acc7 100644 --- a/src/cuda_queue.h +++ b/src/cuda_queue.h @@ -16,7 +16,7 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. * ******************************************************************************/ -#define cuda_numqueues 2 +#define cuda_numqueues 3 /** Type definition for the execution function in #qsched_run. */ typedef void (*qsched_funtype)( int , void * ); @@ -38,6 +38,8 @@ struct queue_cuda { /* The recycling list. */ volatile int *rec_data; + + volatile int nr_avail_tasks; }; diff --git a/src/qsched.c b/src/qsched.c index cbc148424029ca39014c752194fc8a41d766d1dc..eebbfdfc1be395b66e6d1602fe6bdb3ebeaf2ff9 100644 --- a/src/qsched.c +++ b/src/qsched.c @@ -95,7 +95,7 @@ void qsched_res_own ( struct qsched *s , qsched_res_t res , int owner ) { void qsched_ensure ( struct qsched *s , int nr_tasks , int nr_res , int nr_deps , int nr_locks , int nr_uses , int size_data ) { int dirty = 0; - + printf("Called qsched_ensure.\n"); /* Re-allocate tasks? */ if ( s->size < nr_tasks ) { dirty = 1; @@ -787,71 +787,6 @@ struct task* qsched_gettask ( struct qsched *s , int qid ) { } -/** - * @brief Sort the data according to the given indices. - * - * @param data The data to be sorted - * @param ind The indices with respect to which the data are sorted. - * @param N The number of entries - * @param min Lowest index. - * @param max highest index. - * - * This function calls qsched_quicksort. - */ - -void qsched_sort ( int *restrict data, int *restrict ind, int N, int min, int max ) { - int *new_data; - int *new_ind; - int i; - if(N <= 0) - return; - new_data = (int*)malloc(sizeof(int) * N); - if(new_data == NULL) - error("Failed to allocate new_data"); - new_ind = (int*)malloc(sizeof(int) * N); - if(new_ind == NULL) - error("Failed to allocate new_ind"); - - /*Create buckets of size ? - Ideally <16 elements per bucket. Use max-min / N * 10 ? Should give average of 10 elements per bucket */ - int bucketsize = 1; - - /* To find bucket do ind-min / b and it goes in that bucket.*/ - int num_buckets = (max-min) / bucketsize +1; - int *bucket_inds = (int*) malloc(sizeof(int) * num_bucket); - if(bucket_inds == NULL) - error("Failed to allocate bucket_inds"); - memset(bucket_inds,0, sizeof(int)*num_buckets); - for(i = 0; i < N; i++) - { - bucket_inds[(ind[i]-min)]++; - } - for(i = 1; i < num_buckets; i++ ) - { - bucket_inds[i] = bucket_inds[i] + bucket_inds[i-1]; - } - /* bucket_inds[i] contains the starting position for the i+1th bucket*/ - for(i = num_buckets-1; i >0; i--) - { - bucket_inds[i] = bucket_inds[i-1]; - } - bucket_inds[0] = 0; - - for(i = 0; i < N; i++) - { - int z = (ind[i]-min); - new_data[bucket_inds[z]] = data[i]; - new_ind[bucket_inds[z]++] = ind[i]; - } - - /* Copy data back to data and ind and deallocate everything!*/ - memcpy(data, new_data, N*sizeof(int)); - memcpy(ind, new_ind, N*sizeof(int)); - free(new_data); - free(new_ind); - free(bucket_inds); - -} - /** * @brief Sort the data according to the given indices. * @@ -872,7 +807,6 @@ void qsched_quicksort ( int *restrict data , int *restrict ind , int N , int min /* If N is small enough, just do insert sort. */ if ( N < 16 ) { - printf("%i\n", N); for ( i = 1 ; i < N ; i++ ) if ( ind[i] < ind[i-1] ) { temp_i = ind[i]; @@ -934,6 +868,167 @@ void qsched_quicksort ( int *restrict data , int *restrict ind , int N , int min } +int log_base2(unsigned int v) +{ +const unsigned int b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000}; +const unsigned int S[] = {1, 2, 4, 8, 16}; +int i; + +register unsigned int r = 0; // result of log2(v) will go here +for (i = 4; i >= 0; i--) // unroll for speed... +{ + if (v & b[i]) + { + v >>= S[i]; + r |= S[i]; + } +} +return r; +} +/** + * @brief Sort the data according to the given indices. + * + * @param data The data to be sorted + * @param ind The indices with respect to which the data are sorted. + * @param N The number of entries + * @param min Lowest index. + * @param max highest index. + * + * This function calls qsched_quicksort. + */ + +void qsched_sort ( int *restrict data, int *restrict ind, int N, int min, int max ) { + int *new_data; + int *new_ind; + int i, j, temp_i, temp_d; + // printf("N=%i, max =%i, min = %i\n", N, max, min); + if(N <= 0) + return; + // if((max-min) > 10*N) + // { + // qsched_quicksort(data, ind, N, min, max); + // return; + // } +// printf("sort size: %i, max-min = %i\n",N, max-min); + /* If N is small enough, just do insert sort. */ + if ( N < 11 ) { +// printf("%i\n", N); + for ( i = 1 ; i < N ; i++ ) + if ( ind[i] < ind[i-1] ) { + temp_i = ind[i]; + temp_d = data[i]; + for ( j = i ; j > 0 && ind[j-1] > temp_i ; j-- ) { + ind[j] = ind[j-1]; + data[j] = data[j-1]; + } + ind[j] = temp_i; + data[j] = temp_d; + } + return; + } + + if((max-min) > 8*N) + { + int log = log_base2(max-min)-2; + log = log_base2((max-min)/8); + // int bucketsize = 1 << log; + /* Each bucket contains (max-min)/10 width.*/ + new_data = (int*)malloc(sizeof(int)*N); + new_ind = (int*)malloc(sizeof(int)*N); + int num_buckets = ((max-min) >> log) +1; + int *bucket_inds = (int*)malloc(sizeof(int)*num_buckets); + memset(bucket_inds, 0, sizeof(int)*num_buckets); + for(i = 0; i < N; i++) + { + // printf("%i\n", (ind[i]-min) >> log); + if(((ind[i]-min) >> log ) > num_buckets) + error("Exceeded num_buckets, ind[i]-min = %i, log = %i, ind[i]-min>>log=%i, ind[i] = %i, max = %i, num_buckets = %i, N=%i", ind[i]-min, log, (ind[i]-min)>>log, ind[i], max, num_buckets, N); + bucket_inds[(ind[i]-min) >> log]++; + } + for(i = 1; i < num_buckets; i++) + { + bucket_inds[i] = bucket_inds[i] + bucket_inds[i-1]; + } + for(i = num_buckets-1; i > 0; i--) + { + bucket_inds[i] = bucket_inds[i-1]; + } + bucket_inds[0] = 0; + for(i = 0; i < N; i++) + { + int z = (ind[i]-min) >> log; + if(ind[i] > max || ind[i] < min) + error("ind[i]=%i, max = %i, min = %i\n", ind[i], max, min); + + new_data[bucket_inds[z]] = data[i]; + new_ind[bucket_inds[z]++] = ind[i]; + } + memcpy(data, new_data, N*sizeof(int)); + memcpy(ind, new_ind, N*sizeof(int)); + free(new_data); + free(new_ind); + for(i = num_buckets-1; i > 0; i--) + { + bucket_inds[i] = bucket_inds[i-1]; + } + bucket_inds[0] = 0; + /*Recusively sort each bucket. */ + for(i = 0; i < num_buckets-1; i++) + { +// printf("bucket_inds[i] = %i, N = %i, i = %i, num_buckets = %i\n", bucket_inds[i], N, i, num_buckets); + qsched_sort(&data[bucket_inds[i]], &ind[bucket_inds[i]], bucket_inds[i+1] - bucket_inds[i], min + (1<<log)*i, min + (1<<log)*(i+1) ); + } + return; + } + new_data = (int*)malloc(sizeof(int) * N); + if(new_data == NULL) + error("Failed to allocate new_data"); + new_ind = (int*)malloc(sizeof(int) * N); + if(new_ind == NULL) + error("Failed to allocate new_ind"); + + + int bucketsize = 1; + + /* To find bucket do ind-min / b and it goes in that bucket.*/ + int num_buckets = (max-min) / bucketsize +1; + int *bucket_inds = (int*) malloc(sizeof(int) * num_buckets); + if(bucket_inds == NULL) + error("Failed to allocate bucket_inds, max = %i, min = %i", max, min); + memset(bucket_inds,0, sizeof(int)*num_buckets); + for(i = 0; i < N; i++) + { + bucket_inds[(ind[i]-min)]++; + } + for(i = 1; i < num_buckets; i++ ) + { + bucket_inds[i] = bucket_inds[i] + bucket_inds[i-1]; + } + /* bucket_inds[i] contains the starting position for the i+1th bucket*/ + for(i = num_buckets-1; i >0; i--) + { + bucket_inds[i] = bucket_inds[i-1]; + } + bucket_inds[0] = 0; + + for(i = 0; i < N; i++) + { + int z = (ind[i]-min); + new_data[bucket_inds[z]] = data[i]; + new_ind[bucket_inds[z]++] = ind[i]; + } + + /* Copy data back to data and ind and deallocate everything!*/ + memcpy(data, new_data, N*sizeof(int)); + memcpy(ind, new_ind, N*sizeof(int)); + free(new_data); + free(new_ind); + free(bucket_inds); + +} + + + /** * @brief Prepare a #qsched for execution. @@ -1275,7 +1370,6 @@ void qsched_addunlock ( struct qsched *s , int ta , int tb ) { /* Do the deps need to be re-allocated? */ if ( s->count_deps == s->size_deps ) { - /* Scale the deps list size. */ s->size_deps *= qsched_stretch; @@ -1337,7 +1431,6 @@ int qsched_addtask ( struct qsched *s , int type , unsigned int flags , void *da /* Do the tasks need to be re-allocated? */ if ( s->count == s->size ) { - /* Scale the task list size. */ s->size *= qsched_stretch; diff --git a/src/task.h b/src/task.h index dccc2dae7e078510db30c7d368ad1c9769668978..99cd36dcb957f473c799036d2223f7c557dd441e 100644 --- a/src/task.h +++ b/src/task.h @@ -54,6 +54,10 @@ struct task { int qid; /* Task weight for queue selection. */ - int cost, weight; + int cost, weight; + + #ifdef WITH_CUDA + int blockID; + #endif };