diff --git a/examples/Makefile.am b/examples/Makefile.am
index d05fbd7c6714be89285f5715bfa407b477204e77..df37d2cd20cc62ce9afb52dad7675d950e5d62ef 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -71,5 +71,5 @@ test_qr_mpi_LDFLAGS = $(MPI_THREAD_LIBS)
 
 test_qr_mpi_cblas_SOURCES = test_qr_mpi.c
 test_qr_mpi_cblas_CFLAGS = $(AM_CFLAGS) -DWITH_MPI -DWITH_CBLAS_LIB
-test_qr_mpi_cblas_LDADD = ../src/.libs/libquickschedMPI.a $(METIS_LIBS) -llapacke -llapacke -lblas -lcblas
+test_qr_mpi_cblas_LDADD = ../src/.libs/libquickschedMPI.a -llapacke -llapacke -lblas -lcblas $(METIS_LIBS)
 test_qr_mpi_cblas_LDFLAGS = $(MPI_THREAD_LIBS)
diff --git a/examples/test_qr.c b/examples/test_qr.c
index 005e83e9c69beb48ad51f6728d37e5e1981635c3..c52db67a5a7ccb17a2f01375cddd7f1d7dbb565e 100644
--- a/examples/test_qr.c
+++ b/examples/test_qr.c
@@ -30,6 +30,7 @@
 
 #include <pthread.h>
 #include <cblas.h>
+#include <lapacke.h>
 
 /* Local includes. */
 #include "quicksched.h"
@@ -497,7 +498,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
         error("Unknown task type.");
     }
   }
-
+  srand(6);
   /* Allocate and fill the original matrix. */
   if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
       (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL ||
@@ -513,13 +514,13 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
   bzero(tau, sizeof(double) * m * n * K);
 
   /* Dump A_orig. */
-  /* message( "A_orig = [" );
+  message( "A_orig = [" );
   for ( k = 0 ; k < m*K ; k++ ) {
     for ( j = 0 ; j < n*K ; j++ )
       printf( "%.3f " , A_orig[ j*m*K + k ] );
     printf( "\n" );
   }
-  printf( "];\n" ); */
+  printf( "];\n" );
 
   /* Initialize the scheduler. */
   qsched_init(&s, nr_threads, qsched_flag_none);
@@ -541,7 +542,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
     data[0] = k;
     data[1] = k;
     data[2] = k;
-    tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, data,
+    tid_new = qsched_addtask_local(&s, task_DGEQRF, task_flag_none, data,
                              sizeof(int) * 3, 2);
     qsched_addlock(&s, tid_new, rid[k * m + k]);
     if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new);
@@ -552,7 +553,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
       data[0] = k;
      data[1] = j;
       data[2] = k;
-      tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, data,
+      tid_new = qsched_addtask_local(&s, task_DLARFT, task_flag_none, data,
                                sizeof(int) * 3, 3);
       qsched_addlock(&s, tid_new, rid[j * m + k]);
       qsched_adduse(&s, tid_new, rid[k * m + k]);
@@ -568,7 +569,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
       data[0] = i;
       data[1] = k;
       data[2] = k;
-      tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, data,
+      tid_new = qsched_addtask_local(&s, task_DTSQRF, task_flag_none, data,
                                sizeof(int) * 3, 3);
       qsched_addlock(&s, tid_new, rid[k * m + i]);
       qsched_adduse(&s, tid_new, rid[k * m + k]);
@@ -581,12 +582,12 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
        data[0] = i;
        data[1] = j;
        data[2] = k;
-        tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, data,
+        tid_new = qsched_addtask_local(&s, task_DSSRFT, task_flag_none, data,
                                  sizeof(int) * 3, 5);
        qsched_addlock(&s, tid_new, rid[j * m + i]);
        qsched_adduse(&s, tid_new, rid[k * m + i]);
-        qsched_adduse(&s, tid_new, rid[j * m + k]);
-        // qsched_addunlock(&s, tid[k * m + i], tid_new);
+        qsched_addlock(&s, tid_new, rid[j * m + k]);
+        qsched_addunlock(&s, tid[k * m + i], tid_new);
        qsched_addunlock(&s, tid[j * m + (i - 1)],
                         tid_new);
        if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
@@ -616,22 +617,22 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
   }
 
   /* Dump A. */
-  /* message( "A = [" );
+  message( "A = [" );
   for ( k = 0 ; k < m*K ; k++ ) {
     for ( j = 0 ; j < n*K ; j++ )
       printf( "%.3f " , A[ j*m*K + k ] );
     printf( "\n" );
   }
-  printf( "];\n" ); */
+  printf( "];\n" );
 
   /* Dump tau. */
-  /* message( "tau = [" );
+  message( "tau = [" );
   for ( k = 0 ; k < m*K ; k++ ) {
     for ( j = 0 ; j < n ; j++ )
       printf( "%.3f " , tau[ j*m*K + k ] );
     printf( "\n" );
   }
-  printf( "];\n" ); */
+  printf( "];\n" );
 
   /* Dump the tasks. */
   /* for ( k = 0 ; k < s.count ; k++ ) {
@@ -654,7 +655,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
   }
 
   /* Test if the decomposition was correct.*/
-  /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+  double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
   double *Q = computeQ(tempMatrix, m*K, K, tau, m);
   double *R = getR(tempMatrix, m*K);
   cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
@@ -670,7 +671,7 @@ void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
   }
   free(tempMatrix);
   free(Q);
-  free(R);*/
+  free(R);
 
   /* Clean up. */
   free(A);
diff --git a/examples/test_qr_mpi.c b/examples/test_qr_mpi.c
index 640bb4594dcc9381a725d39ac436f098bad0ce84..6bb7c1d47cdacdefde3699ff2ee98c57edb74f85 100644
--- a/examples/test_qr_mpi.c
+++ b/examples/test_qr_mpi.c
@@ -33,9 +33,8 @@
 
 #include <mpi.h>
 
-#ifdef WITH_CLBAS
 #include <cblas.h>
-#endif
+
 
 /* Local includes. */
 #include "quicksched.h"
@@ -639,7 +638,7 @@ task_j = (int*)calloc(sizeof(int), m*n*m*n);
 task_k = (int*)calloc(sizeof(int), m*n*m*n);
 #endif
 
-
+ srand(6);
 /* Allocate and fill the original matrix. */
 if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
     (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL ||
@@ -681,6 +680,17 @@ if(s.rank == 0) {
 for (k = 0; k < m * n; k++) {
   tid[k] = qsched_task_none;
 }
+
+#ifdef WITH_CBLAS_LIB
+  /* Dump A_orig. */
+  message( "A_orig = [" );
+  for ( k = 0 ; k < m*K ; k++ ) {
+    for ( j = 0 ; j < n*K ; j++ )
+      printf( "%.3f " , A_orig[ j*m*K + k ] );
+    printf( "\n" );
+  }
+  printf( "];\n" );
+#endif
 
 /* Build the tasks. */
 for (k = 0; k < m && k < n; k++) {
@@ -907,20 +917,59 @@ for(i = 0; i < n; i++)
 {
     for(j = 0; j < m; j++)
     {
-        long long int id = tau_id[j*m+i];
+        long long int id = tau_id[i*m+j];
         struct res *r = &s.res[s.res_ranks[(id>>48)] + (id & 0xFFFFFFFFFFFFFF)];
         if(r->node == s.rank) {
-            memcpy(&tau[(j*m+i)*K], qsched_getresdata(&s, r->ID), sizeof(double)*K );
+            memcpy(&tau[(i*m+j)*K], qsched_getresdata(&s, r->ID), sizeof(double)*K );
         }
     }
 }
 
 MPI_Allreduce(MPI_IN_PLACE, tau, m*n*K, MPI_DOUBLE, MPI_SUM, s.comm);
+double *tau_new = (double*)calloc(sizeof(double), m*n*K);
+
+
+for(i = 0; i < n; i++)
+{
+    double *column_old = &tau[i*m*K];
+    double *column_new = &tau_new[i*m*K];
+
+    for(j = 0; j < m; j++)
+    {
+        for(k = 0; k < K; k++)
+        {
+            column_new[k*m+j] = column_old[j*K+k];
+        }
+    }
+}
+
+free(tau);
+tau = tau_new;
 
 
 #ifdef WITH_CBLAS_LIB
 
 //This should check correctness.
+
+if(s.rank == 0){
+  message( "A = [" );
+  for ( k = 0 ; k < m*K ; k++ ) {
+    for ( j = 0 ; j < n*K ; j++ )
+      printf( "%.3f " , A[ j*m*K + k ] );
+    printf( "\n" );
+  }
+  printf( "];\n" );
+
+  /* Dump tau. */
+  message( "tau = [" );
+  for ( k = 0 ; k < m*K ; k++ ) {
+    for ( j = 0 ; j < n ; j++ )
+      printf( "%.3f " , tau[ j*m*K + k ] );
+    printf( "\n" );
+  }
+  printf( "];\n" );
+
+
 double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
 double *Q = computeQ(tempMatrix, m*K, K, tau, m);
 double *R = getR(tempMatrix, m*K);
@@ -938,6 +987,7 @@ double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
 free(tempMatrix);
 free(Q);
 free(R);
+}
 #endif
 
 //TODO Clean up the resource data.