diff --git a/examples/test_qr_ompss.c b/examples/test_qr_ompss.c new file mode 100644 index 0000000000000000000000000000000000000000..93069cb62b2dbcd49de0a4d9a4cc043da22716fd --- /dev/null +++ b/examples/test_qr_ompss.c @@ -0,0 +1,394 @@ +/******************************************************************************* + * This file is part of QuickSched. + * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ + + +/* Config parameters. */ +#include "../config.h" + +/* Standard includes. */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <math.h> +#include <omp.h> + +/* LAPACKE header. */ +#include <lapacke.h> +#include <cblas.h> + + +/* Local includes. */ +#include "quicksched.h" + + +/* + * Sam's routines for the tiled QR decomposition. + */ + +/* + \brief Computes 2-norm of a vector \f$x\f$ + + Computes the 2-norm by computing the following: \f[\textrm{2-norm}=\sqrt_0^lx(i)^2\f] + */ +double do2norm(double* x, int l) +{ + double sum = 0, norm; + int i; + + for(i = 0; i < l; i++) + sum += x[i] * x[i]; + + norm = sqrt(sum); + + return norm; +} + +/** + * \brief Computes a Householder reflector from a pair of vectors from coupled blocks + * + * Calculates the Householder vector of the vector formed by a column in a pair of coupled blocks. + * There is a single non-zero element, in the first row, of the top vector. This is passed as topDiag + * + * \param topDiag The only non-zero element of the incoming vector in the top block + * \param ma The number of elements in the top vector + * \param xb Pointer to the lower vector + * \param l The number of elements in the whole vector + * \param vk A pointer to a pre-allocated array to store the householder vector of size l + * + * \returns void + */ +void calcvkDouble (double topDiag, + int ma, + double* xb, + int l, + double* vk) +{ + int sign, i; + double norm, div; + //same non-standard normalisation as for single blocks above, but organised without a temporary beta veriable + + sign = topDiag >= 0.0 ? 1 : -1; + vk[0] = topDiag; + //use vk[0] as beta + for(i = 1; i < ma; i++) + vk[i] = 0; + + for(; i < l; i++) + vk[i] = xb[i - ma]; + + norm = do2norm(vk, l); + vk[0] += norm * sign; + + if(norm != 0.0) + { + div = 1/vk[0]; + + for(i = 1; i < l; i++) + vk[i] *= div; + } +} + + +void updateDoubleQ_WY (double* blockA, + double* blockB, + double* blockTau, + int k, int ma, int mb, int n, + int ldm, + double* hhVector)//bottom, essential part. +{ + int i, j; + + double tau = 1.0, beta; + + /* Compute tau = 2/v'v */ + for(i = 0; i < mb; i ++) + tau += hhVector[i] * hhVector[i]; + + tau = 2/tau; + + for(j = k; j < n; j ++) + { + /* Compute v'*b_j */ + beta = blockA[(j*ldm) + k]; + + /* Then for lower half */ + for(i = 0; i < mb; i ++) + beta += blockB[(j*ldm) + i] * hhVector[i]; + + beta *= tau; + + /* Compute b_j = b_j - beta*v_k */ + blockA[(j*ldm) + k] -= beta; + + for(i = 0; i < mb; i ++) + blockB[(j*ldm) + i] -= beta * hhVector[i]; + } + + /* Insert vector below diagonal. */ + for(i = 0; i < mb; i ++) + blockB[(k*ldm) + i] = hhVector[i]; + + blockTau[k] = tau; +} + +#pragma omp task in( blockA[0] ) inout( blockB[0] ) in( blockTau[0] ) +void DTSQRF (double* blockA, + double* blockB, + double* blockTau, + int ma, + int mb, + int n, + int ldm ) +{ + int k; + double* xVectA, *xVectB; + double hhVector[ 2*ma ]; + + xVectA = blockA; + xVectB = blockB; + + for(k = 0; k < n; k++) + { + //vk = sign(x[1])||x||_2e1 + x + //vk = vk/vk[0] + calcvkDouble(xVectA[0], ma - k, xVectB, (ma + mb) - k, hhVector);//returns essential + + //matA(k:ma,k:na) = matA(k:ma,k:na) - (2/(vk.T*vk))*vk*(vk.T*matA(k:ma,k:na) + //update both blocks, preserving the vectors already stored below the diagonal in the top block and treating them as if they were zeros. + updateDoubleQ_WY (blockA, blockB, + blockTau, + k, ma, mb, n, + ldm, + hhVector + ma - k); + + xVectA += ldm + 1; + xVectB += ldm; + } +} + +#pragma omp task in( blockV[0] ) in( blockB[0] ) inout( blockA[0] ) in( blockTau[0] ) +void DSSRFT (double* blockV, + double* blockA, double* blockB, + double* blockTau, + int b, int n, int ldm) +{ + int i, j, k; + + double tau, beta; + + /* Compute b_j = b_j - tau*v*v'*b_j for each column j of blocks A & B, + and for each householder vector v of blockV */ + + /* For each column of B */ + for(j = 0; j < n; j ++) + { + /* For each householder vector. */ + for(k = 0; k < n; k ++) + { + /* tau = 2/v'v, computed earlier, stored in T(k,k). */ + tau = blockTau[k]; + + /* Compute beta = v_k'b_j. */ + /* v_k is >0 (=1) only at position k in top half. */ + beta = blockA[(j*ldm) + k]; + + /* For lower portion of v_k, aligning with the lower block */ + for(i = 0; i < b; i ++) + beta += blockB[(j*ldm) + i] * blockV[(k*ldm) + i]; + + beta *= tau; + + /* Compute b_j = b_j - beta * v */ + /* v_k = 1 at (k) in top half again */ + blockA[(j*ldm) + k] -= beta; + + /* Apply to bottom block. */ + for(i = 0; i < b; i ++) + blockB[(j*ldm) + i] -= beta * blockV[(k*ldm) + i]; + } + } +} + + +/** + * @breif Wrapper to get the dependencies right. + */ + +#pragma omp task inout( a[0] ) inout( tau[0] ) +void DGEQRF ( int matrix_order, lapack_int m, lapack_int n, + double* a, lapack_int lda, double* tau ) { + LAPACKE_dgeqrf( matrix_order, m, n, a, lda, tau ); + } + + +/** + * @breif Wrapper to get the dependencies right. + */ + +#pragma omp task inout( v[0] ) in( tau[0] ) inout( t[0] ) +void DLARFT ( int matrix_order, char direct, char storev, + lapack_int n, lapack_int k, const double* v, + lapack_int ldv, const double* tau, double* t, + lapack_int ldt ) { + LAPACKE_dlarft_work( matrix_order, direct, storev, n, k, v, ldv, tau, t, ldt ); + } + + +/** + * @brief Computed a tiled QR factorization using QuickSched. + * + * @param m Number of tile rows. + * @param n Number of tile columns. + * @param nr_threads Number of threads to use. + */ + +void test_qr ( int m , int n , int K , int nr_threads , int runs ) { + + int k, j, i, r; + double *A, *A_orig, *tau; + ticks tic, toc_run, tot_setup, tot_run = 0; + + + /* Allocate and fill the original matrix. */ + if ( ( A = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL || + ( tau = (double *)malloc( sizeof(double) * m * n * K ) ) == NULL || + ( A_orig = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL ) + error( "Failed to allocate matrices." ); + for ( k = 0 ; k < m * n * K * K ; k++ ) + A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0; + memcpy( A , A_orig , sizeof(double) * m * n * K * K ); + bzero( tau , sizeof(double) * m * n * K ); + + /* Dump A_orig. */ + /* message( "A_orig = [" ); + for ( k = 0 ; k < m*K ; k++ ) { + for ( j = 0 ; j < n*K ; j++ ) + printf( "%.3f " , A_orig[ j*m*K + k ] ); + printf( "\n" ); + } + printf( "];\n" ); */ + + /* Loop over the number of runs. */ + for ( r = 0 ; r < runs ; r++ ) { + + /* Start the clock. */ + tic = getticks(); + + /* Launch the tasks. */ + for ( k = 0 ; k < m && k < n ; k++ ) { + + /* Add kth corner task. */ + DGEQRF( LAPACK_COL_MAJOR , K, K , + &A[ k*m*K*K + k*K ] , m*K , &tau[ k*m*K + k*K ] ); + + /* Add column tasks on kth row. */ + for ( j = k+1 ; j < n ; j++ ) { + DLARFT( LAPACK_COL_MAJOR , 'F' , 'C' , + K , K , &A[ k*m*K*K + k*K ] , + m*K , &tau[ k*m*K + k*K ] , &A[ j*m*K*K + k*K ] , + m*K ); + } + + /* For each following row... */ + for ( i = k+1 ; i < m ; i++ ) { + + /* Add the row taks for the kth column. */ + DTSQRF( &A[ k*m*K*K + k*K ] , &A[ k*m*K*K + i*K ] , &tau[ k*m*K + i*K ] , K , K , K , K*m ); + + /* Add the inner tasks. */ + for ( j = k+1 ; j < n ; j++ ) { + DSSRFT( &A[ k*m*K + i*K ] , &A[ j*m*K*K + k*K ] , &A[ j*m*K*K + i*K ] , &tau[ k*m*K + i*K ] , K , K , K*m ); + } + + } + + } /* build the tasks. */ + + /* Collect timers. */ + toc_run = getticks(); + message( "%ith run took %lli ticks..." , r , toc_run - tic ); + tot_run += toc_run - tic; + + } + + + /* Dump the costs. */ + message( "costs: setup=%lli ticks, run=%lli ticks." , + tot_setup , tot_run/runs ); + + } + + +/** + * @brief Main function. + */ + +int main ( int argc , char *argv[] ) { + + int c, nr_threads; + int M = 4, N = 4, runs = 1, K = 32; + + /* Get the number of threads. */ + #pragma omp parallel shared(nr_threads) + { + if ( omp_get_thread_num() == 0 ) + nr_threads = omp_get_num_threads(); + } + + /* Parse the options */ + while ( ( c = getopt( argc , argv , "m:n:k:r:t:" ) ) != -1 ) + switch( c ) { + case 'm': + if ( sscanf( optarg , "%d" , &M ) != 1 ) + error( "Error parsing dimension M." ); + break; + case 'n': + if ( sscanf( optarg , "%d" , &N ) != 1 ) + error( "Error parsing dimension M." ); + break; + case 'k': + if ( sscanf( optarg , "%d" , &K ) != 1 ) + error( "Error parsing tile size." ); + break; + case 'r': + if ( sscanf( optarg , "%d" , &runs ) != 1 ) + error( "Error parsing number of runs." ); + break; + case 't': + if ( sscanf( optarg , "%d" , &nr_threads ) != 1 ) + error( "Error parsing number of threads." ); + omp_set_num_threads( nr_threads ); + break; + case '?': + fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] ); + fprintf( stderr , "Computes the tiled QR decomposition of an MxN tiled\n" + "matrix using nr_threads threads.\n" ); + exit( EXIT_FAILURE ); + } + + /* Dump arguments. */ + message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." , + 32*M , 32*N , nr_threads , runs ); + + test_qr( M , N , K , nr_threads , runs ); + + } + +