diff --git a/examples/Makefile.am b/examples/Makefile.am index d91f91d7fb3816dde5f6114c9d8e78677e60803d..2d20cbb1630044ced0bfd7b6d87d05e38696623b 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -20,13 +20,13 @@ AUTOMAKE_OPTIONS=gnu # Add the source directory and debug to CFLAGS -AM_CFLAGS = -g -Wall -Werror -I../src $(OPENMP_CFLAGS) -DCPU_TPS=2.67e9 -DTIMERS \ +AM_CFLAGS = -g -Wall -Werror -I../src $(OPENMP_CFLAGS) -DCPU_TPS=3.1e9 -DTIMERS \ # -fsanitize=address -fno-omit-frame-pointer -AM_LDFLAGS = -lm # -fsanitize=address + AM_LDFLAGS = -lm # -fsanitize=address # Set-up the library -bin_PROGRAMS = test test_qr test_bh +bin_PROGRAMS = test test_qr #test_bh # Sources for test test_SOURCES = test.c @@ -35,11 +35,19 @@ test_LDADD = ../src/.libs/libquicksched.a # Sources for test_qr test_qr_SOURCES = test_qr.c -test_qr_CFLAGS = $(AM_CFLAGS) -test_qr_LDADD = ../src/.libs/libquicksched.a -llapacke -llapack -lblas +test_qr_CFLAGS = $(AM_CFLAGS) -I/home/aidan/lapack-3.5.0/lapacke/include/ +test_qr_LDFLAGS = -I/home/aidan/lapack-3.5.0/lapacke/include/ +test_qr_LDADD = ../src/.libs/libquicksched.a /home/aidan/lapack-3.5.0/liblapacke.a /home/aidan/lapack-3.5.0/liblapack.a -lblas # Sources for test_bh -test_bh_SOURCES = test_bh.c -test_bh_CFLAGS = $(AM_CFLAGS) -test_bh_LDADD = ../src/.libs/libquicksched.a +#test_bh_SOURCES = test_bh.c +#test_bh_CFLAGS = $(AM_CFLAGS) +#test_bh_LDADD = ../src/.libs/libquicksched.a + +#if HAVE_CUDA +#test_cuda_SOURCES = test_gpu_simple.cu +#test_cuda_CFLAGS = -DWITH_CUDA $(AM_CFLAGS) $(CUDA_CFLAGS) $(CUDA_MYFLAGS) +#test_cuda_LINK = $(NVCC) +#test_cuda_LDADD = ../src/.libs/libquicksched_cuda.a $(CUDA_LIBS) +#endif diff --git a/examples/test.c b/examples/test.c index 9d181aa1d4b00ff2e5900d039ca9ab658d07f676..66dfae6af5e7128dcad3f96f952da6d6bea1af82 100644 --- a/examples/test.c +++ b/examples/test.c @@ -118,7 +118,7 @@ void test2 ( int m , int n , int k , int nr_threads ) { /* Build a task for each tile of the matrix c. */ for ( i = 0 ; i < m ; i++ ) for ( j = 0 ; j < n ; j++ ) { - rid = qsched_addres( &s , qsched_owner_none , qsched_res_none ); + rid = qsched_addres( &s , qsched_owner_none , qsched_res_none, NULL , 0 ); data[0] = i; data[1] = j; for ( kk = 0 ; kk < k ; kk++ ) { data[2] = kk; @@ -222,7 +222,7 @@ void test1 ( int m , int n , int k , int nr_threads ) { for ( i = 0 ; i < m ; i++ ) for ( j = 0 ; j < n ; j++ ) { data[0] = i; data[1] = j; - rid = qsched_addres( &s , qsched_owner_none , qsched_res_none ); + rid = qsched_addres( &s , qsched_owner_none , qsched_res_none , NULL , 0); tid = qsched_addtask( &s , 1 , task_flag_none , data , 2*sizeof(int) , 1 ); qsched_addlock( &s , tid , rid ); } diff --git a/examples/test_qr.c b/examples/test_qr.c index 2a754c04d4976e138170aaaf261bfcd8afe77ba5..6f77203afbba2c5b3e1c0a953eb9b89323609187 100644 --- a/examples/test_qr.c +++ b/examples/test_qr.c @@ -37,7 +37,6 @@ /* Local includes. */ #include "quicksched.h" - /* * Sam's routines for the tiled QR decomposition. */ @@ -47,9 +46,9 @@ Computes the 2-norm by computing the following: \f[\textrm{2-norm}=\sqrt_0^lx(i)^2\f] */ -double do2norm(double* x, int l) +float do2norm(float* x, int l) { - double sum = 0, norm; + float sum = 0, norm; int i; for(i = 0; i < l; i++) @@ -74,14 +73,14 @@ double do2norm(double* x, int l) * * \returns void */ -void calcvkDouble (double topDiag, +void calcvkfloat (float topDiag, int ma, - double* xb, + float* xb, int l, - double* vk) + float* vk) { int sign, i; - double norm, div; + float norm, div; //same non-standard normalisation as for single blocks above, but organised without a temporary beta veriable sign = topDiag >= 0.0 ? 1 : -1; @@ -106,16 +105,16 @@ void calcvkDouble (double topDiag, } -void updateDoubleQ_WY (double* blockA, - double* blockB, - double* blockTau, +void updatefloatQ_WY (float* blockA, + float* blockB, + float* blockTau, int k, int ma, int mb, int n, int ldm, - double* hhVector)//bottom, essential part. + float* hhVector)//bottom, essential part. { int i, j; - double tau = 1.0, beta; + float tau = 1.0, beta; /* Compute tau = 2/v'v */ for(i = 0; i < mb; i ++) @@ -148,17 +147,17 @@ void updateDoubleQ_WY (double* blockA, blockTau[k] = tau; } -void DTSQRF (double* blockA, - double* blockB, - double* blockTau, +void DTSQRF (float* blockA, + float* blockB, + float* blockTau, int ma, int mb, int n, int ldm, - double* hhVector) + float* hhVector) { int k; - double* xVectA, *xVectB; + float* xVectA, *xVectB; xVectA = blockA; xVectB = blockB; @@ -167,11 +166,11 @@ void DTSQRF (double* blockA, { //vk = sign(x[1])||x||_2e1 + x //vk = vk/vk[0] - calcvkDouble(xVectA[0], ma - k, xVectB, (ma + mb) - k, hhVector);//returns essential + calcvkfloat(xVectA[0], ma - k, xVectB, (ma + mb) - k, hhVector);//returns essential //matA(k:ma,k:na) = matA(k:ma,k:na) - (2/(vk.T*vk))*vk*(vk.T*matA(k:ma,k:na) //update both blocks, preserving the vectors already stored below the diagonal in the top block and treating them as if they were zeros. - updateDoubleQ_WY (blockA, blockB, + updatefloatQ_WY (blockA, blockB, blockTau, k, ma, mb, n, ldm, @@ -182,14 +181,14 @@ void DTSQRF (double* blockA, } } -void DSSRFT (double* blockV, - double* blockA, double* blockB, - double* blockTau, +void DSSRFT (float* blockV, + float* blockA, float* blockB, + float* blockTau, int b, int n, int ldm) { int i, j, k; - double tau, beta; + float tau, beta; /* Compute b_j = b_j - tau*v*v'*b_j for each column j of blocks A & B, and for each householder vector v of blockV */ @@ -223,7 +222,89 @@ void DSSRFT (double* blockV, } } } + +float* randomMatrix(int m, int n) +{ + float* Matrix; + Matrix = (float*) malloc( sizeof(float) * m*n*32*32); + if(Matrix == NULL) + error("Failed to allocate Matrix"); + int r,c; + m = m*32; + n = n*32; +for(c = 0; c < n; c++) + { + for(r = 0; r < m; r++) + { + //CO(i,j,m) ((m * j) + i) + Matrix[(m*c)+r] = ((float)(rand() % 201) - 100.0) / 100.0; + } + } +return Matrix; + +} +float* generateMatrix( int m, int n) +{ + float* Matrix; + Matrix = (float*) malloc( sizeof(float) * m*n*32*32); + if(Matrix == NULL) + error("Failed to allocate Matrix"); + int i, j; + memset ( Matrix, 0, sizeof(float)*m*n*32*32 ); + + for(i = 0; i < n*32; i++) + { + for(j = 0; j < m*32; j++) + { + Matrix[i*m*32 + j] = (float)(i+j); + } + } + return Matrix; +} + +float* createIdentity(int m, int n) +{ + float* Matrix; + Matrix = (float*) malloc( sizeof(float) * m*n*32*32); + if(Matrix == NULL) + error("Failed to allocate Matrix"); + int i, j; + memset ( Matrix, 0, sizeof(float)*m*n*32*32 ); + + for(i = 0; i < n*32; i++) + { + for(j = 0; j < m*32; j++) + { + if(i==j) + { + Matrix[i*m*32 + j] = 1.0; + }else + { + Matrix[i*m*32 + j] = 0.0; + } + } + } + return Matrix; +} + +void printMatrix(float* Matrix, int m, int n) +{ + int i, j; + + for(i=0; i < m*32; i++) + { + printf("{ "); + for(j = 0; j < n*32; j++) + { + printf(" %.3f ", Matrix[j*m*32 +i]); + + } + printf(" }"); + printf("\n"); + } + +} /** * @brief Computed a tiled QR factorization using QuickSched. @@ -236,7 +317,7 @@ void DSSRFT (double* blockV, void test_qr ( int m , int n , int K , int nr_threads , int runs ) { int k, j, i; - double *A, *A_orig, *tau; + float *A, *A_orig, *tau; struct qsched s; qsched_task_t *tid, tid_new; qsched_res_t *rid; @@ -251,27 +332,28 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) { /* Decode the task data. */ int *idata = (int *)data; - int i = idata[0], j = idata[1], k = idata[2]; - double buff[ 2*K*K ]; + int i = idata[0], j = idata[1];//, k = idata[2]; + float buff[ 2*K*K ]; /* Decode and execute the task. */ switch ( type ) { case task_DGEQRF: - LAPACKE_dgeqrf_work( LAPACK_COL_MAJOR , K, K , + LAPACKE_sgeqrf_work( LAPACK_COL_MAJOR , K, K , &A[ j*m*K*K + i*K ] , m*K , &tau[ j*m*K + i*K ] , buff , 2*K*K ); + break; case task_DLARFT: - LAPACKE_dlarft_work( LAPACK_COL_MAJOR , 'F' , 'C' , + LAPACKE_slarft_work( LAPACK_COL_MAJOR , 'F' , 'C' , K , K , &A[ i*m*K*K + i*K ] , m*K , &tau[ i*m*K + i*K ] , &A[ j*m*K*K + i*K ] , m*K ); break; case task_DTSQRF: - DTSQRF( &A[ j*m*K*K + j*K ] , &A[ j*m*K*K + i*K ] , &tau[ j*m*K + i*K ] , K , K , K , K*m , buff ); + //DTSQRF( &A[ j*m*K*K + j*K ] , &A[ j*m*K*K + i*K ] , &tau[ j*m*K + i*K ] , K , K , K , K*m , buff ); break; case task_DSSRFT: - DSSRFT( &A[ k*m*K + i*K ] , &A[ j*m*K*K + k*K ] , &A[ j*m*K*K + i*K ] , &tau[ k*m*K + i*K ] , K , K , K*m ); + //DSSRFT( &A[ k*m*K + i*K ] , &A[ j*m*K*K + k*K ] , &A[ j*m*K*K + i*K ] , &tau[ k*m*K + i*K ] , K , K , K*m ); break; default: error( "Unknown task type." ); @@ -281,15 +363,28 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) { /* Allocate and fill the original matrix. */ - if ( ( A = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL || - ( tau = (double *)malloc( sizeof(double) * m * n * K ) ) == NULL || - ( A_orig = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL ) + if ( ( A = (float *)malloc( sizeof(float) * m * n * K * K ) ) == NULL || + ( tau = (float *)malloc( sizeof(float) * m * n * K ) ) == NULL || + ( A_orig = (float *)malloc( sizeof(float) * m * n * K * K ) ) == NULL ) error( "Failed to allocate matrices." ); - for ( k = 0 ; k < m * n * K * K ; k++ ) - A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0; - memcpy( A , A_orig , sizeof(double) * m * n * K * K ); - bzero( tau , sizeof(double) * m * n * K ); + free(A_orig); +// for ( k = 0 ; k < m * n * K * K ; k++ ) + // A_orig[k] = 2*((float)rand()) / RAND_MAX - 1.0; +// A_orig = generateMatrix(m, n); + srand(5); + A_orig = randomMatrix(m,n); + printMatrix(A_orig, m, n); + printf("\n\n\n"); + memcpy( A , A_orig , sizeof(float) * m * n * K * K ); + bzero( tau , sizeof(float) * m * n * K ); + + LAPACKE_sgeqrf( LAPACK_COL_MAJOR , K*m, K*n , + A, m*K , tau); + printMatrix(A, m , n ); + printf("\n\n\n"); + memcpy( A , A_orig , sizeof(float) * m * n * K * K ); + bzero( tau , sizeof(float) * m * n * K ); /* Dump A_orig. */ /* message( "A_orig = [" ); for ( k = 0 ; k < m*K ; k++ ) { @@ -309,7 +404,7 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) { error( "Failed to allocate tid/rid matrix." ); for ( k = 0 ; k < m * n ; k++ ) { tid[k] = qsched_task_none; - rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none ); + rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none , NULL, 0 ); } /* Build the tasks. */ @@ -386,7 +481,13 @@ void test_qr ( int m , int n , int K , int nr_threads , int runs ) { tot_run += toc_run - tic; } - + printf("tau = "); + for(k = 0; k < m * n * K ; k++) + { + printf("%.3f ", tau[k]); + } + printf("\n"); + printMatrix(A, m, n); /* Dump A. */ /* message( "A = [" );