diff --git a/examples/Makefile.am b/examples/Makefile.am index 152221294d79d6d628c5dc5c5776bc9f2e309f28..9708f27ed737f34bc9bb425d4b6bb0529b8feb8a 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -23,23 +23,22 @@ AUTOMAKE_OPTIONS=gnu AM_CFLAGS = -g -O3 -Wall -Werror -I../src -ffast-math -fstrict-aliasing \ -ftree-vectorize -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \ -DCPU_TPS=2.67e9 -DTIMERS \ - #-fsanitize=address -fno-omit-frame-pointer + # -fsanitize=address -fno-omit-frame-pointer AM_LDFLAGS = -lm # -fsanitize=address # Set-up the library -bin_PROGRAMS = test test_bh test_bh_2 test_bh_3 test_bh_sorted test_qr -#test_qr +bin_PROGRAMS = test test_qr test_bh test_bh_2 test_bh_3 test_bh_sorted + # Sources for test test_SOURCES = test.c test_CFLAGS = $(AM_CFLAGS) test_LDADD = ../src/.libs/libquicksched.a # Sources for test_qr -#-L/usr/bin64/atlas/ /home/aidan/lapack-3.5.0/liblapacke.a /home/aidan/lapack-3.5.0/liblapack.a -test_qr_SOURCES = test_qr.c /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a -test_qr_CFLAGS = $(AM_CFLAGS) -I/home/aidan/ATLAS/ATLAS_linux/include #-I/home/aidan/lapack-3.5.0/lapacke/include -test_qr_LDADD = ../src/.libs/libquicksched.a -lf77blas -lcblas -latlas -lm -L/home/aidan/ATLAS/ATLAS_linux/lib/ +test_qr_SOURCES = test_qr.c +test_qr_CFLAGS = $(AM_CFLAGS) +test_qr_LDADD = ../src/.libs/libquicksched.a -llapacke -llapacke -lblas # Sources for test_bh test_bh_SOURCES = test_bh.c diff --git a/examples/test_qr.c b/examples/test_qr.c index ada33a33acf4833ed478e10bcd0f0cadf466a5e5..b9cdbecde7b6a2d430f0afbbe24015c88ba8a028 100644 --- a/examples/test_qr.c +++ b/examples/test_qr.c @@ -1,22 +1,21 @@ /******************************************************************************* * This file is part of QuickSched. * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - ******************************************************************************/ - + * +* *****************************************************************************/ /* Config parameters. */ #include "../config.h" @@ -30,187 +29,161 @@ #include <omp.h> #include <pthread.h> - - - #include <cblas.h> - /* Local includes. */ #include "quicksched.h" /** - * Takes a column major matrix, NOT tile major. size is length of a side of the matrix. Only works for square matrices. - * This function is simply for validation and is implemented naively as we know of no implementation to retrieve Q from the tiled QR. + * Takes a column major matrix, NOT tile major. size is length of a side of the + * matrix. Only works for square matrices. + * This function is simply for validation and is implemented naively as we know + * of no implementation to retrieve Q from the tiled QR. */ -double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) -{ - double* Q = malloc(sizeof(double)*size*size); - double* Qtemp = malloc(sizeof(double)*size*size); - double* w = malloc(sizeof(double)*size); - double* ww = malloc(sizeof(double)*size*size); - double* temp = malloc(sizeof(double)*size*size); - int i, k, l, j, n; - bzero(Q, sizeof(double)*size*size); - bzero(Qtemp, sizeof(double)*size*size); - bzero(ww, sizeof(double)*size*size); - for(i = 0; i < size; i++) - { - Q[i*size + i] = 1.0; - } - int numcoltile = size / tilesize; - int numrowtile = size / tilesize; - for(k = 0; k < numrowtile; k++) - { - for(l = 0; l < tilesize; l++) - { - bzero(Qtemp, sizeof(double)*size*size); - for(i = 0; i < size; i++) - { - Qtemp[i*size + i] = 1.0; - } - - - for(i = k; i < numcoltile; i++) - { - bzero(w, sizeof(double)*size); - - for(j = 0 ; j < tilesize; j++) - { - w[i*tilesize + j] = HR[(k*tilesize+l)*size + i*tilesize+j]; - } - w[k*tilesize+l] = 1.0; - if(k*tilesize+l > i*tilesize) - { - for(j = 0; j < k*tilesize+l; j++) - w[j] = 0.0; - } - - - /* Compute (I - tau*w*w')' */ - for(j = 0; j < size; j++) - { - for(n = 0; n < size; n++) - { - if(j != n) - ww[n*size + j] = -tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n]; - else - ww[n*size + j] = 1.0 - tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n]; - } - } - - /* Qtemp = Qtemp * (I-tau*w*w')' */ - cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Qtemp, size, ww, size, 0.0, temp, size); - double *b = Qtemp; - Qtemp = temp; - temp = b; - } - cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Q, size, Qtemp, size, 0.0, temp, size); - double *b = Q; - Q = temp; - temp = b; +double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) { + double* Q = malloc(sizeof(double) * size * size); + double* Qtemp = malloc(sizeof(double) * size * size); + double* w = malloc(sizeof(double) * size); + double* ww = malloc(sizeof(double) * size * size); + double* temp = malloc(sizeof(double) * size * size); + int i, k, l, j, n; + bzero(Q, sizeof(double) * size * size); + bzero(Qtemp, sizeof(double) * size * size); + bzero(ww, sizeof(double) * size * size); + for (i = 0; i < size; i++) { + Q[i * size + i] = 1.0; + } + int numcoltile = size / tilesize; + int numrowtile = size / tilesize; + for (k = 0; k < numrowtile; k++) { + for (l = 0; l < tilesize; l++) { + bzero(Qtemp, sizeof(double) * size * size); + for (i = 0; i < size; i++) { + Qtemp[i * size + i] = 1.0; + } + + for (i = k; i < numcoltile; i++) { + bzero(w, sizeof(double) * size); + + for (j = 0; j < tilesize; j++) { + w[i * tilesize + j] = + HR[(k * tilesize + l) * size + i * tilesize + j]; + } + w[k * tilesize + l] = 1.0; + if (k * tilesize + l > i * tilesize) { + for (j = 0; j < k * tilesize + l; j++) w[j] = 0.0; } + + /* Compute (I - tau*w*w')' */ + for (j = 0; j < size; j++) { + for (n = 0; n < size; n++) { + if (j != n) + ww[n * size + j] = + -tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n]; + else + ww[n * size + j] = + 1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n]; + } + } + + /* Qtemp = Qtemp * (I-tau*w*w')' */ + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, + 1.0, Qtemp, size, ww, size, 0.0, temp, size); + double* b = Qtemp; + Qtemp = temp; + temp = b; + } + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, + 1.0, Q, size, Qtemp, size, 0.0, temp, size); + double* b = Q; + Q = temp; + temp = b; } - - free(Qtemp); - free(w); - free(ww); - free(temp); - return Q; + } + + free(Qtemp); + free(w); + free(ww); + free(temp); + return Q; } -double* getR(double*HR, int size) -{ - double* R = malloc(sizeof(double) * size * size); - int i, j; - bzero(R, sizeof(double) * size * size); - for(i = 0; i< size; i++) - { - for(j = 0; j <= i; j++) - { - R[i*size + j] = HR[i*size + j]; - } +double* getR(double* HR, int size) { + double* R = malloc(sizeof(double) * size * size); + int i, j; + bzero(R, sizeof(double) * size * size); + for (i = 0; i < size; i++) { + for (j = 0; j <= i; j++) { + R[i * size + j] = HR[i * size + j]; } - return R; + } + return R; } -void printMatrix(double* Matrix, int m, int n, int tilesize) -{ - int i, j; - - for(i=0; i < m*tilesize; i++) - { - for(j = 0; j < n*tilesize; j++) - { - printf(" %.3f ", Matrix[j*m*tilesize +i]); +void printMatrix(double* Matrix, int m, int n, int tilesize) { + int i, j; - } - printf("\n"); + for (i = 0; i < m * tilesize; i++) { + for (j = 0; j < n * tilesize; j++) { + printf(" %.3f ", Matrix[j * m * tilesize + i]); } - + printf("\n"); + } } -double* columnToTile( double* columnMatrix, int size , int m , int n , int tilesize) -{ - double* TileMatrix; - TileMatrix = malloc(sizeof(double) * size ); - if(TileMatrix == NULL) - error("failed to allocate TileMatrix"); - int i,j,k,l; - - for( i = 0; i < n ; i++ ) - { - for( j = 0; j < m; j++ ) - { - double *tileStart = &columnMatrix[i*m*tilesize*tilesize + j*tilesize]; - double *tilePos = &TileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize]; - for(k = 0; k < tilesize; k++) - { - tileStart = &columnMatrix[i*m*tilesize*tilesize+k*m*tilesize + j*tilesize]; - for( l = 0; l < tilesize; l++ ) - { - tilePos[k*tilesize + l] = tileStart[l]; - } - } +double* columnToTile(double* columnMatrix, int size, int m, int n, + int tilesize) { + double* TileMatrix; + TileMatrix = malloc(sizeof(double) * size); + if (TileMatrix == NULL) error("failed to allocate TileMatrix"); + int i, j, k, l; + + for (i = 0; i < n; i++) { + for (j = 0; j < m; j++) { + double* tileStart = + &columnMatrix[i * m * tilesize * tilesize + j * tilesize]; + double* tilePos = + &TileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize]; + for (k = 0; k < tilesize; k++) { + tileStart = &columnMatrix[i * m * tilesize * tilesize + + k * m * tilesize + j * tilesize]; + for (l = 0; l < tilesize; l++) { + tilePos[k * tilesize + l] = tileStart[l]; } + } } + } - return TileMatrix; - + return TileMatrix; } -double* tileToColumn( double* tileMatrix, int size, int m , int n , int tilesize) -{ - double* ColumnMatrix; - ColumnMatrix = (double*) malloc(sizeof(double) * size ); - if(ColumnMatrix == NULL) - error("failed to allocate ColumnMatrix"); - int i,j,k,l; - for( i = 0; i < n ; i++ ) - { - for(j = 0; j < m ; j++ ) - { - /* Tile on ith column is at i*m*32*32.*/ - /* Tile on jth is at j*32*32 */ - double *tile = &tileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize]; - /* Column starts at same position as tile. */ - /* Row j*32.*/ - double *tilePos = &ColumnMatrix[i*m*tilesize*tilesize + j*tilesize]; - for( k = 0; k < tilesize; k++ ) - { - for(l=0; l < tilesize; l++) - { - tilePos[l] = tile[l]; - } - /* Next 32 elements are the position of the tile in the next column.*/ - tile = &tile[tilesize]; - /* Move to the j*32th position in the next column. */ - tilePos = &tilePos[tilesize*m]; - - } - } +double* tileToColumn(double* tileMatrix, int size, int m, int n, int tilesize) { + double* ColumnMatrix; + ColumnMatrix = (double*)malloc(sizeof(double) * size); + if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix"); + int i, j, k, l; + for (i = 0; i < n; i++) { + for (j = 0; j < m; j++) { + /* Tile on ith column is at i*m*32*32.*/ + /* Tile on jth is at j*32*32 */ + double* tile = + &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize]; + /* Column starts at same position as tile. */ + /* Row j*32.*/ + double* tilePos = + &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize]; + for (k = 0; k < tilesize; k++) { + for (l = 0; l < tilesize; l++) { + tilePos[l] = tile[l]; + } + /* Next 32 elements are the position of the tile in the next column.*/ + tile = &tile[tilesize]; + /* Move to the j*32th position in the next column. */ + tilePos = &tilePos[tilesize * m]; + } } - return ColumnMatrix; + } + return ColumnMatrix; } /* Routines for the tiled QR decomposition.*/ @@ -219,89 +192,78 @@ double* tileToColumn( double* tileMatrix, int size, int m , int n , int tilesize * * @brief Computes the QR decomposition of a tile. * - * @param cornerTile A pointer to the tile for which the decomposition is computed. + * @param cornerTile A pointer to the tile for which the decomposition is +* computed. * @param tilesize The number of elements on a row/column of the tile. * @param tauMatrix A pointer to the tau Matrix. * @param k The value of k for the QR decomposition - * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). + * @param tauNum The number of tau values stored for each row of the matrix +* (this is equal to the number of tiles on each column). * * */ -void DGEQRF(double* restrict cornerTile, int tileSize, double* restrict tauMatrix, int k, int tauNum, double* w) -{ - int i, j, n; - double norm=0.0, sign, u1, tau, z; - - - /*Find the householder vector for each row. */ - for(i = 0; i < tileSize; i++) - { - norm = 0.0; - /*Fill w with the vector.*/ - for(j=i; j < tileSize; j++) - { - /* ith row is i*tileSize, only want elements on diagonal or below.*/ - w[ j ] = cornerTile[ i*tileSize +j ]; - /*Find the norm as well */ - norm = norm + w[j]*w[j]; - } - if(w[i] >= 0.0) - sign = -1; - else - sign = 1; +void DGEQRF(double* restrict cornerTile, int tileSize, + double* restrict tauMatrix, int k, int tauNum) { + int i, j, n; + double norm = 0.0, sign, u1, tau, z; + double w[tileSize]; + + /*Find the householder vector for each row. */ + for (i = 0; i < tileSize; i++) { + norm = 0.0; + /*Fill w with the vector.*/ + for (j = i; j < tileSize; j++) { + /* ith row is i*tileSize, only want elements on diagonal or below.*/ + w[j] = cornerTile[i * tileSize + j]; + /*Find the norm as well */ + norm = norm + w[j] * w[j]; + } + if (w[i] >= 0.0) + sign = -1; + else + sign = 1; - norm = sqrt(norm); + norm = sqrt(norm); - u1 = w[i] - sign*norm; + u1 = w[i] - sign * norm; - if(u1 != 0.0) - { - for(j=i+1; j < tileSize; j++) - w[j] = w[j] / u1; - } - else - { - for(j=i+1; j < tileSize; j++) - w[j] = 0.0; - } + if (u1 != 0.0) { + for (j = i + 1; j < tileSize; j++) w[j] = w[j] / u1; + } else { + for (j = i + 1; j < tileSize; j++) w[j] = 0.0; + } - if(norm != 0.0) - tau = -sign*u1/norm; - else - tau = 0.0; + if (norm != 0.0) + tau = -sign * u1 / norm; + else + tau = 0.0; - /*Store the below diagonal vector */ - - for(j = i+1; j < tileSize; j++) - { - cornerTile[ i*tileSize +j ] = w[j]; - } - cornerTile[ i*tileSize + i ] = sign*norm; - w[i] = 1.0; - /* Apply the householder transformation to the rest of the tile, for everything to the right of the diagonal. */ - for(j = i+1; j < tileSize; j++) - { - /*Compute w'*A_j*/ - z = cornerTile[ j*tileSize+i]; - for(n = i+1; n < tileSize; n++) - { - z = z + cornerTile[j*tileSize+n] * w[n]; - } - /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */ - for(n=i; n < tileSize; n++) - { - cornerTile[j*tileSize+n] = cornerTile[j*tileSize+n] - tau*w[n]*z; - } - - } - /* Store tau. We're on k*tileSize+ith row. kth column.*/ - tauMatrix[(k*tileSize+i)*tauNum+k] = tau; + /*Store the below diagonal vector */ + for (j = i + 1; j < tileSize; j++) { + cornerTile[i * tileSize + j] = w[j]; } - + cornerTile[i * tileSize + i] = sign * norm; + w[i] = 1.0; + /* Apply the householder transformation to the rest of the tile, for + * everything to the right of the diagonal. */ + for (j = i + 1; j < tileSize; j++) { + /*Compute w'*A_j*/ + z = cornerTile[j * tileSize + i]; + for (n = i + 1; n < tileSize; n++) { + z = z + cornerTile[j * tileSize + n] * w[n]; + } + /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */ + for (n = i; n < tileSize; n++) { + cornerTile[j * tileSize + n] = + cornerTile[j * tileSize + n] - tau * w[n] * z; + } + } + /* Store tau. We're on k*tileSize+ith row. kth column.*/ + tauMatrix[(k * tileSize + i) * tauNum + k] = tau; + } } - /** * * @brief Applies the householder factorisation of the corner to the row tile. @@ -312,46 +274,40 @@ void DGEQRF(double* restrict cornerTile, int tileSize, double* restrict tauMatri * @param tauMatrix A pointer to the tau Matrix. * @param jj The value of j for the QR decomposition. * @param kk The value of k for the QR decomposition. - * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). + * @param tauNum The number of tau values stored for each row of the matrix +* (this is equal to the number of tiles on each column). * * */ -void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize, int jj, int kk, double* restrict tauMatrix, int tauNum, double* w) -{ - int i, j, n; - double z=0.0; - - - /* For each row in the corner Tile*/ - for(i = 0; i < tileSize; i++) - { - /*Get w for row i */ - for(j = i; j < tileSize; j++) - { - w[j] = cornerTile[i*tileSize + j]; - } - w[i] = 1.0; - - /* Apply to the row Tile */ - for(j = 0; j < tileSize; j++) - { - z=0.0; - /* Below Diagonal!*/ - /*Compute w'*A_j*/ - for(n = i; n < tileSize; n++) - { - z = z + w[n] * rowTile[j*tileSize+n]; - } - for(n = i; n < tileSize; n++) - { - rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z; - } - } - - +void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize, + int jj, int kk, double* restrict tauMatrix, int tauNum) { + int i, j, n; + double z = 0.0; + double w[tileSize]; + + /* For each row in the corner Tile*/ + for (i = 0; i < tileSize; i++) { + /*Get w for row i */ + for (j = i; j < tileSize; j++) { + w[j] = cornerTile[i * tileSize + j]; } - - + w[i] = 1.0; + + /* Apply to the row Tile */ + for (j = 0; j < tileSize; j++) { + z = 0.0; + /* Below Diagonal!*/ + /*Compute w'*A_j*/ + for (n = i; n < tileSize; n++) { + z = z + w[n] * rowTile[j * tileSize + n]; + } + for (n = i; n < tileSize; n++) { + rowTile[j * tileSize + n] = + rowTile[j * tileSize + n] - + tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z; + } + } + } } /** @@ -364,83 +320,77 @@ void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize, * @param tauMatrix A pointer to the tau Matrix. * @param ii The value of i for the QR decomposition. * @param kk The value of k for the QR decomposition. - * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). + * @param tauNum The number of tau values stored for each row of the matrix +* (this is equal to the number of tiles on each column). * * */ -void DTSQRF( double* restrict cornerTile, double* restrict columnTile, int tilesize, int ii, int kk, double* restrict tauMatrix, int tauNum, double* w ) -{ - int i, j, n; - double norm=0.0, sign, u1, tau, z; - - /* For each column compute the householder vector. */ - for(i = 0; i < tilesize; i++) - { - norm = 0.0; - w[i] = cornerTile[ i*tilesize+i ]; - norm = norm + w[i]*w[i]; - for(j = i+1; j < tilesize; j++) - { - w[j] = 0.0; - } - for(j = 0; j < tilesize; j++) - { - w[tilesize+j] = columnTile[ i*tilesize+j ]; - norm = norm + w[tilesize+j]*w[tilesize+j]; - } - - norm = sqrt(norm); - if(w[i] >= 0.0) - sign = -1; - else - sign = 1; - - - u1 = w[i] - sign*norm; - if(u1 != 0.0) - { - for(j = i+1; j < 2*tilesize; j++){ - w[j] = w[j]/u1; - } - }else - { - for(j = i+1; j < 2*tilesize; j++) - w[j] = 0.0; - } +void DTSQRF(double* restrict cornerTile, double* restrict columnTile, + int tilesize, int ii, int kk, double* restrict tauMatrix, + int tauNum) { + int i, j, n; + double norm = 0.0, sign, u1, tau, z; + double w[2*tilesize]; + + /* For each column compute the householder vector. */ + for (i = 0; i < tilesize; i++) { + norm = 0.0; + w[i] = cornerTile[i * tilesize + i]; + norm = norm + w[i] * w[i]; + for (j = i + 1; j < tilesize; j++) { + w[j] = 0.0; + } + for (j = 0; j < tilesize; j++) { + w[tilesize + j] = columnTile[i * tilesize + j]; + norm = norm + w[tilesize + j] * w[tilesize + j]; + } - if(norm != 0) - tau = -sign*u1/norm; - else - tau = 0.0; - - /* Apply to each row to the right.*/ - for(j = i; j < tilesize; j++) - { - /* Find w'*A_j, w is 0s except for first value with upper tile.*/ - z = 1.0 * cornerTile[ j*tilesize+i ]; - for(n = 0; n < tilesize; n++) - { - z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ]; - } - /* Apply to upper tile.*/ - cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z; - for(n = i+1; n < tilesize; n++) - { - cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z; - } - /* Apply to lower tile.*/ - for(n = 0; n < tilesize; n++) - { - columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z; - } - - } - /* Store w*/ - for(j = 0; j < tilesize; j++){ - columnTile[ i*tilesize+j ] = w[tilesize+j]; - } - tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau; + norm = sqrt(norm); + if (w[i] >= 0.0) + sign = -1; + else + sign = 1; + + u1 = w[i] - sign * norm; + if (u1 != 0.0) { + for (j = i + 1; j < 2 * tilesize; j++) { + w[j] = w[j] / u1; + } + } else { + for (j = i + 1; j < 2 * tilesize; j++) w[j] = 0.0; } + + if (norm != 0) + tau = -sign * u1 / norm; + else + tau = 0.0; + + /* Apply to each row to the right.*/ + for (j = i; j < tilesize; j++) { + /* Find w'*A_j, w is 0s except for first value with upper tile.*/ + z = 1.0 * cornerTile[j * tilesize + i]; + for (n = 0; n < tilesize; n++) { + z = z + w[tilesize + n] * columnTile[j * tilesize + n]; + } + /* Apply to upper tile.*/ + cornerTile[j * tilesize + i] = + cornerTile[j * tilesize + i] - tau * 1.0 * z; + for (n = i + 1; n < tilesize; n++) { + cornerTile[j * tilesize + n] = + cornerTile[j * tilesize + n] - tau * w[n] * z; + } + /* Apply to lower tile.*/ + for (n = 0; n < tilesize; n++) { + columnTile[j * tilesize + n] = + columnTile[j * tilesize + n] - tau * w[tilesize + n] * z; + } + } + /* Store w*/ + for (j = 0; j < tilesize; j++) { + columnTile[i * tilesize + j] = w[tilesize + j]; + } + tauMatrix[(kk * tilesize + i) * tauNum + ii] = tau; + } } /** @@ -455,46 +405,45 @@ void DTSQRF( double* restrict cornerTile, double* restrict columnTile, int tiles * @param ii The value of i for the QR decomposition. * @param jj The value of j for the QR decomposition. * @param kk The value of k for the QR decomposition. - * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). + * @param tauNum The number of tau values stored for each row of the matrix +* (this is equal to the number of tiles on each column). * * */ -void DSSRFT( double* restrict cornerTile, double* restrict columnTile, double* restrict rowTile, int tilesize, int ii, int jj, int kk, double* restrict tauMatrix, int tauNum , double* w) -{ - int i, j, n; - double z; - - - for(i = 0; i < tilesize; i++) - { - for(j = 0; j < i; j++) - w[j] = 0.0; - w[i] = 1.0; - for(j = i+1; j < tilesize; j++) - w[j] = 0.0; - for(j = 0; j < tilesize; j++) - w[j+tilesize] = columnTile[i*tilesize +j]; - - /* Apply householder vector (w) to the tiles.*/ - for(j = 0; j < tilesize; j++) - { - z = 0.0; - /* Compute w' * A_j */ - for(n = 0; n < tilesize; n++) - { - z += w[n] * rowTile[j*tilesize+n]; - z += w[n + tilesize] * cornerTile[j*tilesize+n]; - } - for(n = 0; n < tilesize; n++) - { - rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z; - cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z; - } - } +void DSSRFT(double* restrict cornerTile, double* restrict columnTile, + double* restrict rowTile, int tilesize, int ii, int jj, int kk, + double* restrict tauMatrix, int tauNum) { + int i, j, n; + double z; + double w[2*tilesize]; + + for (i = 0; i < tilesize; i++) { + for (j = 0; j < i; j++) w[j] = 0.0; + w[i] = 1.0; + for (j = i + 1; j < tilesize; j++) w[j] = 0.0; + for (j = 0; j < tilesize; j++) + w[j + tilesize] = columnTile[i * tilesize + j]; + + /* Apply householder vector (w) to the tiles.*/ + for (j = 0; j < tilesize; j++) { + z = 0.0; + /* Compute w' * A_j */ + for (n = 0; n < tilesize; n++) { + z += w[n] * rowTile[j * tilesize + n]; + z += w[n + tilesize] * cornerTile[j * tilesize + n]; + } + for (n = 0; n < tilesize; n++) { + rowTile[j * tilesize + n] = + rowTile[j * tilesize + n] - + tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[n] * z; + cornerTile[j * tilesize + n] = + cornerTile[j * tilesize + n] - + tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[tilesize + n] * z; + } } + } +} -} - /** * @brief Computed a tiled QR factorization using QuickSched. * @@ -502,302 +451,298 @@ void DSSRFT( double* restrict cornerTile, double* restrict columnTile, double* r * @param n Number of tile columns. * @param nr_threads Number of threads to use. */ - -void test_qr ( int m , int n , int K , int nr_threads , int runs, double *matrix ) { - - int k, j, i; - double *A, *A_orig, *tau; - struct qsched s; - qsched_task_t *tid, tid_new; - qsched_res_t *rid; - int data[3]; - ticks tic, toc_run, tot_setup, tot_run = 0; - - enum task_types { task_DGEQRF , task_DLARFT , task_DTSQRF , task_DSSRFT }; - - - /* Runner function to pass to the scheduler. */ - void runner ( int type , void *data ) { - - /* Decode the task data. */ - int *idata = (int *)data; - int i = idata[0], j = idata[1], k = idata[2]; - double ww[ 2*K ]; - - /* Decode and execute the task. */ - switch ( type ) { - case task_DGEQRF: - DGEQRF( &A[ (k*m+k)*K*K], K, tau, k, m, ww); - break; - case task_DLARFT: - DLARFT( &A[ (k*m+k)*K*K ],&A[(j*m+k)*K*K], K, j, k, tau, m, ww); - break; - case task_DTSQRF: - DTSQRF( &A[(k*m+k)*K*K], &A[(k*m+i)*K*K], K, i, k, tau, m , ww); - break; - case task_DSSRFT: - DSSRFT(&A[(j*m+i)*K*K], &A[(k*m+i)*K*K], &A[(j*m+k)*K*K], K, i, j, k, tau, m , ww); - break; - default: - error( "Unknown task type." ); - } - } - - - /* Allocate and fill the original matrix. */ - if ( ( A = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL || - ( tau = (double *)malloc( sizeof(double) * m * n * K ) ) == NULL || - ( A_orig = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL ) - error( "Failed to allocate matrices." ); - for ( k = 0 ; k < m * n * K * K ; k++ ) - { - if(matrix == NULL) - A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0; - else - A_orig[k] = matrix[k]; +void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) { + + int k, j, i; + double* A, *A_orig, *tau; + struct qsched s; + qsched_task_t* tid, tid_new; + qsched_res_t* rid; + int data[3]; + ticks tic, toc_run, tot_setup, tot_run = 0; + + enum task_types { + task_DGEQRF, + task_DLARFT, + task_DTSQRF, + task_DSSRFT + }; + + /* Runner function to pass to the scheduler. */ + void runner(int type, void * data) { + + /* Decode the task data. */ + int* idata = (int*)data; + int i = idata[0], j = idata[1], k = idata[2]; + + /* Decode and execute the task. */ + switch (type) { + case task_DGEQRF: + DGEQRF(&A[(k * m + k) * K * K], K, tau, k, m); + break; + case task_DLARFT: + DLARFT(&A[(k * m + k) * K * K], &A[(j * m + k) * K * K], K, j, k, tau, + m); + break; + case task_DTSQRF: + DTSQRF(&A[(k * m + k) * K * K], &A[(k * m + i) * K * K], K, i, k, tau, + m); + break; + case task_DSSRFT: + DSSRFT(&A[(j * m + i) * K * K], &A[(k * m + i) * K * K], + &A[(j * m + k) * K * K], K, i, j, k, tau, m); + break; + default: + error("Unknown task type."); } - memcpy( A , A_orig , sizeof(double) * m * n * K * K ); - bzero( tau , sizeof(double) * m * n * K ); - - /* Dump A_orig. */ - /* message( "A_orig = [" ); - for ( k = 0 ; k < m*K ; k++ ) { - for ( j = 0 ; j < n*K ; j++ ) - printf( "%.3f " , A_orig[ j*m*K + k ] ); - printf( "\n" ); - } - printf( "];\n" ); */ - - /* Initialize the scheduler. */ - qsched_init( &s , nr_threads , qsched_flag_none ); - - /* Allocate and init the task ID and resource ID matrix. */ - tic = getticks(); - if ( ( tid = (qsched_task_t *)malloc( sizeof(qsched_task_t) * m * n ) ) == NULL || - ( rid = (qsched_res_t *)malloc( sizeof(qsched_res_t) * m * n ) ) == NULL ) - error( "Failed to allocate tid/rid matrix." ); - for ( k = 0 ; k < m * n ; k++ ) { - tid[k] = qsched_task_none; - rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none ); - } - - /* Build the tasks. */ - for ( k = 0 ; k < m && k < n ; k++ ) { - - /* Add kth corner task. */ - data[0] = k; data[1] = k; data[2] = k; - tid_new = qsched_addtask( &s , task_DGEQRF , task_flag_none , data , sizeof(int)*3 , 2 ); - qsched_addlock( &s , tid_new , rid[ k*m + k ] ); - if ( tid[ k*m + k ] != -1 ) - qsched_addunlock( &s , tid[ k*m + k ] , tid_new ); - tid[ k*m + k ] = tid_new; - - /* Add column tasks on kth row. */ - for ( j = k+1 ; j < n ; j++ ) { - data[0] = k; data[1] = j; data[2] = k; - tid_new = qsched_addtask( &s , task_DLARFT , task_flag_none , data , sizeof(int)*3 , 3 ); - qsched_addlock( &s , tid_new , rid[ j*m + k ] ); - qsched_adduse( &s , tid_new , rid[ k*m + k ] ); - qsched_addunlock( &s , tid[ k*m + k ] , tid_new ); - if ( tid[ j*m + k ] != -1 ) - qsched_addunlock( &s , tid[ j*m + k ] , tid_new ); - tid[ j*m + k ] = tid_new; - } - - /* For each following row... */ - for ( i = k+1 ; i < m ; i++ ) { - - /* Add the row taks for the kth column. */ - data[0] = i; data[1] = k; data[2] = k; - tid_new = qsched_addtask( &s , task_DTSQRF , task_flag_none , data , sizeof(int)*3 , 3 ); - qsched_addlock( &s , tid_new , rid[ k*m + i ] ); - qsched_adduse( &s , tid_new , rid[ k*m + k ] ); - qsched_addunlock( &s , tid[ k*m + (i-1) ] , tid_new ); - if ( tid[ k*m + i ] != -1 ) - qsched_addunlock( &s , tid[ k*m + i ] , tid_new ); - tid[ k*m + i ] = tid_new; - - /* Add the inner tasks. */ - for ( j = k+1 ; j < n ; j++ ) { - data[0] = i; data[1] = j; data[2] = k; - tid_new = qsched_addtask( &s , task_DSSRFT , task_flag_none , data , sizeof(int)*3 , 5 ); - qsched_addlock( &s , tid_new , rid[ j*m + i ] ); - qsched_adduse( &s , tid_new , rid[ k*m + i ] ); - qsched_adduse( &s , tid_new , rid[ j*m + k ] ); - qsched_addunlock( &s , tid[ k*m + i ] , tid_new ); - qsched_addunlock( &s, tid[j*m+i-1], tid_new); - if ( tid[ j*m + i ] != -1 ) - qsched_addunlock( &s , tid[ j*m + i ] , tid_new ); - - tid[ j*m + i ] = tid_new; - } - - } - - } /* build the tasks. */ - tot_setup = getticks() - tic; - - /* Dump the number of tasks. */ - message( "total nr of tasks: %i." , s.count ); - message( "total nr of deps: %i." , s.count_deps ); - message( "total nr of res: %i." , s.count_res ); - message( "total nr of locks: %i." , s.count_locks ); - message( "total nr of uses: %i." , s.count_uses ); - - - /* Loop over the number of runs. */ - for ( k = 0 ; k < runs ; k++ ) { - - /* Execute the the tasks. */ - tic = getticks(); - qsched_run( &s , nr_threads , runner ); - toc_run = getticks(); - message( "%ith run took %lli ticks..." , k , toc_run - tic ); - tot_run += toc_run - tic; - - } - - - /* Dump A. */ - /* message( "A = [" ); - for ( k = 0 ; k < m*K ; k++ ) { - for ( j = 0 ; j < n*K ; j++ ) - printf( "%.3f " , A[ j*m*K + k ] ); - printf( "\n" ); - } - printf( "];\n" ); */ - - /* Dump tau. */ - /* message( "tau = [" ); - for ( k = 0 ; k < m*K ; k++ ) { - for ( j = 0 ; j < n ; j++ ) - printf( "%.3f " , tau[ j*m*K + k ] ); - printf( "\n" ); - } - printf( "];\n" ); */ - - /* Dump the tasks. */ - /* for ( k = 0 ; k < s.count ; k++ ) { - int *d = (int *)&s.data[ s.tasks[k].data ]; - printf( " %i %i %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid , d[0] , d[1] , s.tasks[k].tic , s.tasks[k].toc ); - } */ - - /* Dump the costs. */ - message( "costs: setup=%lli ticks, run=%lli ticks." , - tot_setup , tot_run/runs ); - - /* Dump the timers. */ - for ( k = 0 ; k < qsched_timer_count ; k++ ) - message( "timer %s is %lli ticks." , qsched_timer_names[k] , s.timers[k]/runs ); - - if(matrix != NULL) - { - for(k = 0; k < m*n*K*K; k++) - matrix[k] = A[k]; + } + + /* Allocate and fill the original matrix. */ + if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL || + (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL || + (A_orig = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL) + error("Failed to allocate matrices."); + for (k = 0; k < m * n * K * K; k++) { + if (matrix == NULL) + A_orig[k] = 2 * ((double)rand()) / RAND_MAX - 1.0; + else + A_orig[k] = matrix[k]; + } + memcpy(A, A_orig, sizeof(double) * m * n * K * K); + bzero(tau, sizeof(double) * m * n * K); + + /* Dump A_orig. */ + /* message( "A_orig = [" ); + for ( k = 0 ; k < m*K ; k++ ) { + for ( j = 0 ; j < n*K ; j++ ) + printf( "%.3f " , A_orig[ j*m*K + k ] ); + printf( "\n" ); + } + printf( "];\n" ); */ + + /* Initialize the scheduler. */ + qsched_init(&s, nr_threads, qsched_flag_none); + + /* Allocate and init the task ID and resource ID matrix. */ + tic = getticks(); + if ((tid = (qsched_task_t*)malloc(sizeof(qsched_task_t)* m* n)) == NULL || + (rid = (qsched_res_t*)malloc(sizeof(qsched_res_t) * m * n)) == NULL) + error("Failed to allocate tid/rid matrix."); + for (k = 0; k < m * n; k++) { + tid[k] = qsched_task_none; + rid[k] = qsched_addres(&s, qsched_owner_none, qsched_res_none); + } + + /* Build the tasks. */ + for (k = 0; k < m && k < n; k++) { + + /* Add kth corner task. */ + data[0] = k; + data[1] = k; + data[2] = k; + tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, data, + sizeof(int) * 3, 2); + qsched_addlock(&s, tid_new, rid[k * m + k]); + if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new); + tid[k * m + k] = tid_new; + + /* Add column tasks on kth row. */ + for (j = k + 1; j < n; j++) { + data[0] = k; + data[1] = j; + data[2] = k; + tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, data, + sizeof(int) * 3, 3); + qsched_addlock(&s, tid_new, rid[j * m + k]); + qsched_adduse(&s, tid_new, rid[k * m + k]); + qsched_addunlock(&s, tid[k * m + k], tid_new); + if (tid[j * m + k] != -1) qsched_addunlock(&s, tid[j * m + k], tid_new); + tid[j * m + k] = tid_new; } - - /* Test if the decomposition was correct.*/ - /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K); - double *Q = computeQ(tempMatrix, m*K, K, tau, m); - double *R = getR(tempMatrix, m*K); - cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, m*K, R, m*K, 0.0, tempMatrix, m*K); - free(Q); - Q = tileToColumn(A_orig, m*n*K*K, m, n, K); - for(i = 0; i < m * n * K * K; i++) - { - if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] < 0.995)) - printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i], tempMatrix[i]); - } - free(tempMatrix); - free(Q); - free(R);*/ - - /* Clean up. */ - free(A); - free(A_orig); - free(tau); - free(tid); - free(rid); - qsched_free( &s ); - + /* For each following row... */ + for (i = k + 1; i < m; i++) { + + /* Add the row taks for the kth column. */ + data[0] = i; + data[1] = k; + data[2] = k; + tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, data, + sizeof(int) * 3, 3); + qsched_addlock(&s, tid_new, rid[k * m + i]); + qsched_adduse(&s, tid_new, rid[k * m + k]); + qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new); + if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new); + tid[k * m + i] = tid_new; + + /* Add the inner tasks. */ + for (j = k + 1; j < n; j++) { + data[0] = i; + data[1] = j; + data[2] = k; + tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, data, + sizeof(int) * 3, 5); + qsched_addlock(&s, tid_new, rid[j * m + i]); + qsched_adduse(&s, tid_new, rid[k * m + i]); + qsched_adduse(&s, tid_new, rid[j * m + k]); + // qsched_addunlock(&s, tid[k * m + i], tid_new); + qsched_addunlock(&s, tid[j * m + i - 1], tid_new); + if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new); + + tid[j * m + i] = tid_new; + } } + } /* build the tasks. */ + tot_setup = getticks() - tic; -/* Generates a random matrix. */ -double* generateColumnMatrix(int size) -{ - double* matrix = malloc(sizeof(double)*size*size); - if(matrix == NULL) - error("Failed to allocate matrix"); - - unsigned long int m_z = 35532; - int i; - for(i = 0 ; i < size*size; i++) - { - m_z = (1664525*m_z + 1013904223) % 4294967296; - matrix[i] = m_z % 100; - if(matrix[i] < 0) - matrix[i] += 100; - } - return matrix; + /* Dump the number of tasks. */ + message("total nr of tasks: %i.", s.count); + message("total nr of deps: %i.", s.count_deps); + message("total nr of res: %i.", s.count_res); + message("total nr of locks: %i.", s.count_locks); + message("total nr of uses: %i.", s.count_uses); + + /* Loop over the number of runs. */ + for (k = 0; k < runs; k++) { + + /* Execute the the tasks. */ + tic = getticks(); + qsched_run(&s, nr_threads, runner); + toc_run = getticks(); + message("%ith run took %lli ticks...", k, toc_run - tic); + tot_run += toc_run - tic; + } + + /* Dump A. */ + /* message( "A = [" ); + for ( k = 0 ; k < m*K ; k++ ) { + for ( j = 0 ; j < n*K ; j++ ) + printf( "%.3f " , A[ j*m*K + k ] ); + printf( "\n" ); + } + printf( "];\n" ); */ + + /* Dump tau. */ + /* message( "tau = [" ); + for ( k = 0 ; k < m*K ; k++ ) { + for ( j = 0 ; j < n ; j++ ) + printf( "%.3f " , tau[ j*m*K + k ] ); + printf( "\n" ); + } + printf( "];\n" ); */ + + /* Dump the tasks. */ + /* for ( k = 0 ; k < s.count ; k++ ) { + int *d = (int *)&s.data[ s.tasks[k].data ]; + printf( " %i %i %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid , + d[0] , d[1] , s.tasks[k].tic , s.tasks[k].toc ); + } */ + + /* Dump the costs. */ + message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup, + tot_run / runs); + + /* Dump the timers. */ + for (k = 0; k < qsched_timer_count; k++) + message("timer %s is %lli ticks.", qsched_timer_names[k], + s.timers[k] / runs); + + if (matrix != NULL) { + for (k = 0; k < m * n * K * K; k++) matrix[k] = A[k]; + } + + /* Test if the decomposition was correct.*/ + /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K); + double *Q = computeQ(tempMatrix, m*K, K, tau, m); + double *R = getR(tempMatrix, m*K); + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, + m*K, R, m*K, 0.0, tempMatrix, m*K); + free(Q); + Q = tileToColumn(A_orig, m*n*K*K, m, n, K); + for(i = 0; i < m * n * K * K; i++) + { + if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] < + 0.995)) + printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i], + tempMatrix[i]); + } + free(tempMatrix); + free(Q); + free(R);*/ + + /* Clean up. */ + free(A); + free(A_orig); + free(tau); + free(tid); + free(rid); + qsched_free(&s); } +/* Generates a random matrix. */ +double* generateColumnMatrix(int size) { + double* matrix = malloc(sizeof(double) * size * size); + if (matrix == NULL) error("Failed to allocate matrix"); + + unsigned long int m_z = 35532; + int i; + for (i = 0; i < size * size; i++) { + m_z = (1664525 * m_z + 1013904223) % 4294967296; + matrix[i] = m_z % 100; + if (matrix[i] < 0) matrix[i] += 100; + } + return matrix; +} /** * @brief Main function. */ - -int main ( int argc , char *argv[] ) { - - int c, nr_threads; - int M = 4, N = 4, runs = 1, K = 32; - - /* Get the number of threads. */ - #pragma omp parallel shared(nr_threads) - { - if ( omp_get_thread_num() == 0 ) - nr_threads = omp_get_num_threads(); - } - - /* Parse the options */ - while ( ( c = getopt( argc , argv , "m:n:k:r:t:" ) ) != -1 ) - switch( c ) { - case 'm': - if ( sscanf( optarg , "%d" , &M ) != 1 ) - error( "Error parsing dimension M." ); - break; - case 'n': - if ( sscanf( optarg , "%d" , &N ) != 1 ) - error( "Error parsing dimension M." ); - break; - case 'k': - if ( sscanf( optarg , "%d" , &K ) != 1 ) - error( "Error parsing tile size." ); - break; - case 'r': - if ( sscanf( optarg , "%d" , &runs ) != 1 ) - error( "Error parsing number of runs." ); - break; - case 't': - if ( sscanf( optarg , "%d" , &nr_threads ) != 1 ) - error( "Error parsing number of threads." ); - omp_set_num_threads( nr_threads ); - break; - case '?': - fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] ); - fprintf( stderr , "Computes the tiled QR decomposition of an MxN tiled\n" - "matrix using nr_threads threads.\n" ); - exit( EXIT_FAILURE ); - } - - /* Dump arguments. */ - message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." , - 32*M , 32*N , nr_threads , runs ); - - test_qr( M , N , K , nr_threads , runs , NULL); - + +int main(int argc, char* argv[]) { + + int c, nr_threads; + int M = 4, N = 4, runs = 1, K = 32; + +/* Get the number of threads. */ +#pragma omp parallel shared(nr_threads) + { + if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads(); + } + + /* Parse the options */ + while ((c = getopt(argc, argv, "m:n:k:r:t:")) != -1) switch (c) { + case 'm': + if (sscanf(optarg, "%d", &M) != 1) error("Error parsing dimension M."); + break; + case 'n': + if (sscanf(optarg, "%d", &N) != 1) error("Error parsing dimension M."); + break; + case 'k': + if (sscanf(optarg, "%d", &K) != 1) error("Error parsing tile size."); + break; + case 'r': + if (sscanf(optarg, "%d", &runs) != 1) + error("Error parsing number of runs."); + break; + case 't': + if (sscanf(optarg, "%d", &nr_threads) != 1) + error("Error parsing number of threads."); + omp_set_num_threads(nr_threads); + break; + case '?': + fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n", + argv[0]); + fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n" + "matrix using nr_threads threads.\n"); + exit(EXIT_FAILURE); } - - + + /* Dump arguments. */ + message("Computing the tiled QR decomposition of a %ix%i matrix using %i " + "threads (%i runs).", + 32 * M, 32 * N, nr_threads, runs); + + test_qr(M, N, K, nr_threads, runs, NULL); +} diff --git a/examples/test_qr_ompss.c b/examples/test_qr_ompss.c index 014d1e0d2b0227379c69917690c3e270c4179979..8d9286775f01d4ac3b8514f6a657c20f2eede68f 100644 --- a/examples/test_qr_ompss.c +++ b/examples/test_qr_ompss.c @@ -1,22 +1,21 @@ /******************************************************************************* * This file is part of QuickSched. * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - ******************************************************************************/ - + * +* *****************************************************************************/ /* Config parameters. */ #include "../config.h" @@ -33,262 +32,378 @@ #include <lapacke.h> #include <cblas.h> - /* Local includes. */ #include "cycle.h" - /* Error macro. */ -#define error(s, ...) { fprintf( stderr , "%s:%s():%i: " s "\n" , __FILE__ , __FUNCTION__ , __LINE__ , ##__VA_ARGS__ ); abort(); } +#define error(s, ...) \ + { \ + fprintf(stderr, "%s:%s():%i: " s "\n", __FILE__, __FUNCTION__, __LINE__, \ + ##__VA_ARGS__); \ + abort(); \ + } /* Message macro. */ -#define message(s, ...) { printf( "%s: " s "\n" , __FUNCTION__ , ##__VA_ARGS__ ); fflush(stdout); } - +#define message(s, ...) \ + { \ + printf("%s: " s "\n", __FUNCTION__, ##__VA_ARGS__); \ + fflush(stdout); \ + } /* Stuff to collect task data. */ struct timer { - int threadID, type; - ticks tic, toc; - }; -struct timer *timers; + int threadID, type; + ticks tic, toc; +}; +struct timer* timers; int nr_timers = 0; -/* - * Sam's routines for the tiled QR decomposition. - */ - -/* - \brief Computes 2-norm of a vector \f$x\f$ - - Computes the 2-norm by computing the following: \f[\textrm{2-norm}=\sqrt_0^lx(i)^2\f] - */ -double do2norm(double* x, int l) -{ - double sum = 0, norm; - int i; - - for(i = 0; i < l; i++) - sum += x[i] * x[i]; +double* columnToTile(double* columnMatrix, int size, int m, int n, + int tilesize) { + double* TileMatrix; + TileMatrix = malloc(sizeof(double) * size); + if (TileMatrix == NULL) error("failed to allocate TileMatrix"); + int i, j, k, l; + + for (i = 0; i < n; i++) { + for (j = 0; j < m; j++) { + double* tileStart = + &columnMatrix[i * m * tilesize * tilesize + j * tilesize]; + double* tilePos = + &TileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize]; + for (k = 0; k < tilesize; k++) { + tileStart = &columnMatrix[i * m * tilesize * tilesize + + k * m * tilesize + j * tilesize]; + for (l = 0; l < tilesize; l++) { + tilePos[k * tilesize + l] = tileStart[l]; + } + } + } + } - norm = sqrt(sum); + return TileMatrix; +} - return norm; +double* tileToColumn(double* tileMatrix, int size, int m, int n, int tilesize) { + double* ColumnMatrix; + ColumnMatrix = (double*)malloc(sizeof(double) * size); + if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix"); + int i, j, k, l; + for (i = 0; i < n; i++) { + for (j = 0; j < m; j++) { + /* Tile on ith column is at i*m*32*32.*/ + /* Tile on jth is at j*32*32 */ + double* tile = + &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize]; + /* Column starts at same position as tile. */ + /* Row j*32.*/ + double* tilePos = + &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize]; + for (k = 0; k < tilesize; k++) { + for (l = 0; l < tilesize; l++) { + tilePos[l] = tile[l]; + } + /* Next 32 elements are the position of the tile in the next column.*/ + tile = &tile[tilesize]; + /* Move to the j*32th position in the next column. */ + tilePos = &tilePos[tilesize * m]; + } + } + } + return ColumnMatrix; } +/* Routines for the tiled QR decomposition.*/ + /** - * \brief Computes a Householder reflector from a pair of vectors from coupled blocks * - * Calculates the Householder vector of the vector formed by a column in a pair of coupled blocks. - * There is a single non-zero element, in the first row, of the top vector. This is passed as topDiag + * @brief Computes the QR decomposition of a tile. + * + * @param cornerTile A pointer to the tile for which the decomposition is +* computed. + * @param tilesize The number of elements on a row/column of the tile. + * @param tauMatrix A pointer to the tau Matrix. + * @param k The value of k for the QR decomposition + * @param tauNum The number of tau values stored for each row of the matrix +* (this is equal to the number of tiles on each column). * - * \param topDiag The only non-zero element of the incoming vector in the top block - * \param ma The number of elements in the top vector - * \param xb Pointer to the lower vector - * \param l The number of elements in the whole vector - * \param vk A pointer to a pre-allocated array to store the householder vector of size l * - * \returns void */ -void calcvkDouble (double topDiag, - int ma, - double* xb, - int l, - double* vk) -{ - int sign, i; - double norm, div; - //same non-standard normalisation as for single blocks above, but organised without a temporary beta veriable - - sign = topDiag >= 0.0 ? 1 : -1; - vk[0] = topDiag; - //use vk[0] as beta - for(i = 1; i < ma; i++) - vk[i] = 0; - - for(; i < l; i++) - vk[i] = xb[i - ma]; - - norm = do2norm(vk, l); - vk[0] += norm * sign; - - if(norm != 0.0) - { - div = 1/vk[0]; - - for(i = 1; i < l; i++) - vk[i] *= div; - } -} - - -void updateDoubleQ_WY (double* blockA, - double* blockB, - double* blockTau, - int k, int ma, int mb, int n, - int ldm, - double* hhVector)//bottom, essential part. -{ - int i, j; - - double tau = 1.0, beta; - - /* Compute tau = 2/v'v */ - for(i = 0; i < mb; i ++) - tau += hhVector[i] * hhVector[i]; - - tau = 2/tau; - - for(j = k; j < n; j ++) - { - /* Compute v'*b_j */ - beta = blockA[(j*ldm) + k]; +#pragma omp task inout(cornerTile[0]) out(tauMatrix[0]) +void DGEQRF(double* restrict cornerTile, int tileSize, + double* restrict tauMatrix, int k, int tauNum) { + int i, j, n; + double norm = 0.0, sign, u1, tau, z; + double w[tileSize]; + + /* Timer stuff. */ + int ind = __sync_fetch_and_add( &nr_timers , 1 ); + timers[ind].threadID = omp_get_thread_num(); + timers[ind].type = 0; + timers[ind].tic = getticks(); + + /*Find the householder vector for each row. */ + for (i = 0; i < tileSize; i++) { + norm = 0.0; + /*Fill w with the vector.*/ + for (j = i; j < tileSize; j++) { + /* ith row is i*tileSize, only want elements on diagonal or below.*/ + w[j] = cornerTile[i * tileSize + j]; + /*Find the norm as well */ + norm = norm + w[j] * w[j]; + } + if (w[i] >= 0.0) + sign = -1; + else + sign = 1; - /* Then for lower half */ - for(i = 0; i < mb; i ++) - beta += blockB[(j*ldm) + i] * hhVector[i]; + norm = sqrt(norm); - beta *= tau; + u1 = w[i] - sign * norm; - /* Compute b_j = b_j - beta*v_k */ - blockA[(j*ldm) + k] -= beta; - - for(i = 0; i < mb; i ++) - blockB[(j*ldm) + i] -= beta * hhVector[i]; - } + if (u1 != 0.0) { + for (j = i + 1; j < tileSize; j++) w[j] = w[j] / u1; + } else { + for (j = i + 1; j < tileSize; j++) w[j] = 0.0; + } - /* Insert vector below diagonal. */ - for(i = 0; i < mb; i ++) - blockB[(k*ldm) + i] = hhVector[i]; + if (norm != 0.0) + tau = -sign * u1 / norm; + else + tau = 0.0; - blockTau[k] = tau; -} + /*Store the below diagonal vector */ -// #pragma omp task in( blockA[0] ) inout( blockB[0] ) in( blockTau[0] ) -void DTSQRF (double* blockA, - double* blockB, - double* blockTau, - int ma, - int mb, - int n, - int ldm ) -{ - int k; - double* xVectA, *xVectB; - double hhVector[ 2*ma ]; - - int ind = __sync_fetch_and_add( &nr_timers , 1 ); - timers[ind].threadID = omp_get_thread_num(); - timers[ind].type = 2; - timers[ind].tic = getticks(); - - xVectA = blockA; - xVectB = blockB; - - for(k = 0; k < n; k++) - { - //vk = sign(x[1])||x||_2e1 + x - //vk = vk/vk[0] - calcvkDouble(xVectA[0], ma - k, xVectB, (ma + mb) - k, hhVector);//returns essential - - //matA(k:ma,k:na) = matA(k:ma,k:na) - (2/(vk.T*vk))*vk*(vk.T*matA(k:ma,k:na) - //update both blocks, preserving the vectors already stored below the diagonal in the top block and treating them as if they were zeros. - updateDoubleQ_WY (blockA, blockB, - blockTau, - k, ma, mb, n, - ldm, - hhVector + ma - k); - - xVectA += ldm + 1; - xVectB += ldm; - } - timers[ind].toc = getticks(); + for (j = i + 1; j < tileSize; j++) { + cornerTile[i * tileSize + j] = w[j]; + } + cornerTile[i * tileSize + i] = sign * norm; + w[i] = 1.0; + /* Apply the householder transformation to the rest of the tile, for + * everything to the right of the diagonal. */ + for (j = i + 1; j < tileSize; j++) { + /*Compute w'*A_j*/ + z = cornerTile[j * tileSize + i]; + for (n = i + 1; n < tileSize; n++) { + z = z + cornerTile[j * tileSize + n] * w[n]; + } + /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */ + for (n = i; n < tileSize; n++) { + cornerTile[j * tileSize + n] = + cornerTile[j * tileSize + n] - tau * w[n] * z; + } + } + /* Store tau. We're on k*tileSize+ith row. kth column.*/ + tauMatrix[(k * tileSize + i) * tauNum + k] = tau; + } + timers[ind].toc = getticks(); } -// #pragma omp task in( blockV[0] ) in( blockB[0] ) inout( blockA[0] ) in( blockTau[0] ) -void DSSRFT (double* blockV, - double* blockA, double* blockB, - double* blockTau, - int b, int n, int ldm) -{ - int i, j, k; - - double tau, beta; - - int ind = __sync_fetch_and_add( &nr_timers , 1 ); - timers[ind].threadID = omp_get_thread_num(); - timers[ind].type = 3; - timers[ind].tic = getticks(); +/** + * + * @brief Applies the householder factorisation of the corner to the row tile. + * + * @param cornerTile A pointer to the tile for which the householder is stored. + * @param rowTiles The tile to which the householder is applied. + * @param tilesize The number of elements on a row/column of the tile. + * @param tauMatrix A pointer to the tau Matrix. + * @param jj The value of j for the QR decomposition. + * @param kk The value of k for the QR decomposition. + * @param tauNum The number of tau values stored for each row of the matrix +* (this is equal to the number of tiles on each column). + * + * + */ +#pragma omp task in(cornerTile[0]) inout(rowTile[0]) in(tauMatrix[0]) +void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize, + int jj, int kk, double* restrict tauMatrix, int tauNum) { + int i, j, n; + double z = 0.0; + double w[tileSize]; + + /* Timer stuff. */ + int ind = __sync_fetch_and_add( &nr_timers , 1 ); + timers[ind].threadID = omp_get_thread_num(); + timers[ind].type = 1; + timers[ind].tic = getticks(); - /* Compute b_j = b_j - tau*v*v'*b_j for each column j of blocks A & B, - and for each householder vector v of blockV */ - - /* For each column of B */ - for(j = 0; j < n; j ++) - { - /* For each householder vector. */ - for(k = 0; k < n; k ++) - { - /* tau = 2/v'v, computed earlier, stored in T(k,k). */ - tau = blockTau[k]; - - /* Compute beta = v_k'b_j. */ - /* v_k is >0 (=1) only at position k in top half. */ - beta = blockA[(j*ldm) + k]; - - /* For lower portion of v_k, aligning with the lower block */ - for(i = 0; i < b; i ++) - beta += blockB[(j*ldm) + i] * blockV[(k*ldm) + i]; - - beta *= tau; - - /* Compute b_j = b_j - beta * v */ - /* v_k = 1 at (k) in top half again */ - blockA[(j*ldm) + k] -= beta; - - /* Apply to bottom block. */ - for(i = 0; i < b; i ++) - blockB[(j*ldm) + i] -= beta * blockV[(k*ldm) + i]; - } - } - timers[ind].toc = getticks(); + /* For each row in the corner Tile*/ + for (i = 0; i < tileSize; i++) { + /*Get w for row i */ + for (j = i; j < tileSize; j++) { + w[j] = cornerTile[i * tileSize + j]; + } + w[i] = 1.0; + + /* Apply to the row Tile */ + for (j = 0; j < tileSize; j++) { + z = 0.0; + /* Below Diagonal!*/ + /*Compute w'*A_j*/ + for (n = i; n < tileSize; n++) { + z = z + w[n] * rowTile[j * tileSize + n]; + } + for (n = i; n < tileSize; n++) { + rowTile[j * tileSize + n] = + rowTile[j * tileSize + n] - + tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z; + } + } + } + timers[ind].toc = getticks(); } - /** - * @breif Wrapper to get the dependencies right. + * + * @brief Applies the householder factorisation of the corner to the row tile. + * + * @param cornerTile The corner tile for this value of k. + * @param columnTile The tile for which householders are computed. + * @param tilesize The number of elements on a row/column of the tile. + * @param tauMatrix A pointer to the tau Matrix. + * @param ii The value of i for the QR decomposition. + * @param kk The value of k for the QR decomposition. + * @param tauNum The number of tau values stored for each row of the matrix +* (this is equal to the number of tiles on each column). + * + * */ - -// #pragma omp task inout( a[0] ) inout( tau[0] ) -void DGEQRF ( int matrix_order, lapack_int m, lapack_int n, - double* a, lapack_int lda, double* tau ) { - int ind = __sync_fetch_and_add( &nr_timers , 1 ); - timers[ind].threadID = omp_get_thread_num(); - timers[ind].type = 0; - timers[ind].tic = getticks(); - LAPACKE_dgeqrf( matrix_order, m, n, a, lda, tau ); - timers[ind].toc = getticks(); +#pragma omp task inout(cornerTile[0]) inout(columnTile[0]) out(tauMatrix[0]) +void DTSQRF(double* restrict cornerTile, double* restrict columnTile, + int tilesize, int ii, int kk, double* restrict tauMatrix, + int tauNum) { + int i, j, n; + double norm = 0.0, sign, u1, tau, z; + double w[2*tilesize]; + + /* Timer stuff. */ + int ind = __sync_fetch_and_add( &nr_timers , 1 ); + timers[ind].threadID = omp_get_thread_num(); + timers[ind].type = 2; + timers[ind].tic = getticks(); + + /* For each column compute the householder vector. */ + for (i = 0; i < tilesize; i++) { + norm = 0.0; + w[i] = cornerTile[i * tilesize + i]; + norm = norm + w[i] * w[i]; + for (j = i + 1; j < tilesize; j++) { + w[j] = 0.0; + } + for (j = 0; j < tilesize; j++) { + w[tilesize + j] = columnTile[i * tilesize + j]; + norm = norm + w[tilesize + j] * w[tilesize + j]; } + norm = sqrt(norm); + if (w[i] >= 0.0) + sign = -1; + else + sign = 1; + + u1 = w[i] - sign * norm; + if (u1 != 0.0) { + for (j = i + 1; j < 2 * tilesize; j++) { + w[j] = w[j] / u1; + } + } else { + for (j = i + 1; j < 2 * tilesize; j++) w[j] = 0.0; + } + + if (norm != 0) + tau = -sign * u1 / norm; + else + tau = 0.0; + + /* Apply to each row to the right.*/ + for (j = i; j < tilesize; j++) { + /* Find w'*A_j, w is 0s except for first value with upper tile.*/ + z = 1.0 * cornerTile[j * tilesize + i]; + for (n = 0; n < tilesize; n++) { + z = z + w[tilesize + n] * columnTile[j * tilesize + n]; + } + /* Apply to upper tile.*/ + cornerTile[j * tilesize + i] = + cornerTile[j * tilesize + i] - tau * 1.0 * z; + for (n = i + 1; n < tilesize; n++) { + cornerTile[j * tilesize + n] = + cornerTile[j * tilesize + n] - tau * w[n] * z; + } + /* Apply to lower tile.*/ + for (n = 0; n < tilesize; n++) { + columnTile[j * tilesize + n] = + columnTile[j * tilesize + n] - tau * w[tilesize + n] * z; + } + } + /* Store w*/ + for (j = 0; j < tilesize; j++) { + columnTile[i * tilesize + j] = w[tilesize + j]; + } + tauMatrix[(kk * tilesize + i) * tauNum + ii] = tau; + } + timers[ind].toc = getticks(); +} /** - * @breif Wrapper to get the dependencies right. + * + * @brief Applies the householder factorisation of the corner to the row tile. + * + * @param cornerTile A pointer to the tile to have the householder applied. + * @param columnTile The tile in which the householders are stored. + * @param rowTile The upper tile to have the householders applied. + * @param tilesize The number of elements on a row/column of the tile. + * @param tauMatrix A pointer to the tau Matrix. + * @param ii The value of i for the QR decomposition. + * @param jj The value of j for the QR decomposition. + * @param kk The value of k for the QR decomposition. + * @param tauNum The number of tau values stored for each row of the matrix +* (this is equal to the number of tiles on each column). + * + * */ - -// #pragma omp task inout( v[0] ) in( tau[0] ) inout( t[0] ) -void DLARFT ( int matrix_order, char direct, char storev, - lapack_int n, lapack_int k, const double* v, - lapack_int ldv, const double* tau, double* t, - lapack_int ldt ) { - int ind = __sync_fetch_and_add( &nr_timers , 1 ); - timers[ind].threadID = omp_get_thread_num(); - timers[ind].type = 1; - timers[ind].tic = getticks(); - LAPACKE_dlarft_work( matrix_order, direct, storev, n, k, v, ldv, tau, t, ldt ); - timers[ind].toc = getticks(); +#pragma omp task inout(cornerTile[0]) in(columnTile[0]) inout(rowTile[0]) in(tauMatrix[0]) +void DSSRFT(double* restrict cornerTile, double* restrict columnTile, + double* restrict rowTile, int tilesize, int ii, int jj, int kk, + double* restrict tauMatrix, int tauNum) { + int i, j, n; + double z; + double w[2*tilesize]; + + /* Timer stuff. */ + int ind = __sync_fetch_and_add( &nr_timers , 1 ); + timers[ind].threadID = omp_get_thread_num(); + timers[ind].type = 3; + timers[ind].tic = getticks(); + + for (i = 0; i < tilesize; i++) { + for (j = 0; j < i; j++) w[j] = 0.0; + w[i] = 1.0; + for (j = i + 1; j < tilesize; j++) w[j] = 0.0; + for (j = 0; j < tilesize; j++) + w[j + tilesize] = columnTile[i * tilesize + j]; + + /* Apply householder vector (w) to the tiles.*/ + for (j = 0; j < tilesize; j++) { + z = 0.0; + /* Compute w' * A_j */ + for (n = 0; n < tilesize; n++) { + z += w[n] * rowTile[j * tilesize + n]; + z += w[n + tilesize] * cornerTile[j * tilesize + n]; + } + for (n = 0; n < tilesize; n++) { + rowTile[j * tilesize + n] = + rowTile[j * tilesize + n] - + tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[n] * z; + cornerTile[j * tilesize + n] = + cornerTile[j * tilesize + n] - + tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[tilesize + n] * z; + } } - - + } + timers[ind].toc = getticks(); +} + /** * @brief Computed a tiled QR factorization using QuickSched. * @@ -296,151 +411,140 @@ void DLARFT ( int matrix_order, char direct, char storev, * @param n Number of tile columns. * @param nr_threads Number of threads to use. */ - -void test_qr ( int m , int n , int K , int nr_threads , int runs ) { - - int k, j, i, r; - double *A, *A_orig, *tau; - ticks tic, toc_run, tot_setup, tot_run = 0; - int tid[ m*n ]; - - - /* Allocate and fill the original matrix. */ - if ( ( A = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL || - ( tau = (double *)malloc( sizeof(double) * m * n * K ) ) == NULL || - ( A_orig = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL ) - error( "Failed to allocate matrices." ); - for ( k = 0 ; k < m * n * K * K ; k++ ) - A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0; - memcpy( A , A_orig , sizeof(double) * m * n * K * K ); - bzero( tau , sizeof(double) * m * n * K ); - - /* Dump A_orig. */ - /* message( "A_orig = [" ); - for ( k = 0 ; k < m*K ; k++ ) { - for ( j = 0 ; j < n*K ; j++ ) - printf( "%.3f " , A_orig[ j*m*K + k ] ); - printf( "\n" ); - } - printf( "];\n" ); */ - - /* Loop over the number of runs. */ - for ( r = 0 ; r < runs ; r++ ) { - - /* Start the clock. */ - tic = getticks(); - nr_timers = 0; - - /* Launch the tasks. */ - for ( k = 0 ; k < m && k < n ; k++ ) { - - /* Add kth corner task. */ - #pragma omp task inout( tid[ k*m + k ] ) - DGEQRF( LAPACK_COL_MAJOR , K, K , - &A[ k*m*K*K + k*K ] , m*K , &tau[ k*m*K + k*K ] ); - - /* Add column tasks on kth row. */ - for ( j = k+1 ; j < n ; j++ ) { - #pragma omp task inout( tid[ j*m + k ] ) in( tid[ k*m + k ] ) - DLARFT( LAPACK_COL_MAJOR , 'F' , 'C' , - K , K , &A[ k*m*K*K + k*K ] , - m*K , &tau[ k*m*K + k*K ] , &A[ j*m*K*K + k*K ] , - m*K ); - } - - /* For each following row... */ - for ( i = k+1 ; i < m ; i++ ) { - - /* Add the row taks for the kth column. */ - #pragma omp task inout( tid[ k*m + i ] ) in( tid[ k*m + k ] ) - DTSQRF( &A[ k*m*K*K + k*K ] , &A[ k*m*K*K + i*K ] , &tau[ k*m*K + i*K ] , K , K , K , K*m ); - - /* Add the inner tasks. */ - for ( j = k+1 ; j < n ; j++ ) { - #pragma omp task inout( tid[ j*m + i ] ) in( tid[ k*m + i ] , tid[ j*m + k ] ) - DSSRFT( &A[ k*m*K + i*K ] , &A[ j*m*K*K + k*K ] , &A[ j*m*K*K + i*K ] , &tau[ k*m*K + i*K ] , K , K , K*m ); - } - - } - - } /* build the tasks. */ - - /* Collect timers. */ - #pragma omp taskwait - toc_run = getticks(); - message( "%ith run took %lli ticks..." , r , toc_run - tic ); - tot_run += toc_run - tic; - + +void test_qr(int m, int n, int K, int nr_threads, int runs) { + + int k, j, i, r; + double* A, *A_orig, *tau; + ticks tic, toc_run, tot_run = 0; + + /* Allocate and fill the original matrix. */ + if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL || + (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL || + (A_orig = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL) + error("Failed to allocate matrices."); + for (k = 0; k < m * n * K * K; k++) + A_orig[k] = 2 * ((double)rand()) / RAND_MAX - 1.0; + memcpy(A, A_orig, sizeof(double) * m * n * K * K); + bzero(tau, sizeof(double) * m * n * K); + + /* Dump A_orig. */ + /* message( "A_orig = [" ); + for ( k = 0 ; k < m*K ; k++ ) { + for ( j = 0 ; j < n*K ; j++ ) + printf( "%.3f " , A_orig[ j*m*K + k ] ); + printf( "\n" ); + } + printf( "];\n" ); */ + + /* Loop over the number of runs. */ + for (r = 0; r < runs; r++) { + + /* Start the clock. */ + tic = getticks(); + nr_timers = 0; + + /* Launch the tasks. */ + for (k = 0; k < m && k < n; k++) { + + /* Add kth corner task. */ + // #pragma omp task inout( tid[ k*m + k ] ) + DGEQRF(&A[(k * m + k) * K * K], K, tau, k, m); + + /* Add column tasks on kth row. */ + for (j = k + 1; j < n; j++) { + // #pragma omp task inout( tid[ j*m + k ] ) in( tid[ k*m + k ] ) + DLARFT(&A[(k * m + k) * K * K], &A[(j * m + k) * K * K], K, j, k, tau, + m); + } + + /* For each following row... */ + for (i = k + 1; i < m; i++) { + + /* Add the row taks for the kth column. */ + // #pragma omp task inout( tid[ k*m + i ] ) in( tid[ k*m + k ] ) + DTSQRF(&A[(k * m + k) * K * K], &A[(k * m + i) * K * K], K, i, k, tau, + m); + + /* Add the inner tasks. */ + for (j = k + 1; j < n; j++) { + // #pragma omp task inout( tid[ j*m + i ] ) in( tid[ k*m + i ] , tid[ + // j*m + k ] ) + DSSRFT(&A[(j * m + i) * K * K], &A[(k * m + i) * K * K], + &A[(j * m + k) * K * K], K, i, j, k, tau, m); } - - /* Dump the costs. */ - message( "costs: setup=%lli ticks, run=%lli ticks." , - tot_setup , tot_run/runs ); - - /* Dump the tasks. */ - /* for ( k = 0 ; k < nr_timers ; k++ ) - printf( "%i %i %lli %lli\n" , timers[k].threadID , timers[k].type , timers[k].tic , timers[k].toc ); */ - - } + } + + } /* build the tasks. */ + +/* Collect timers. */ +#pragma omp taskwait + toc_run = getticks(); + message("%ith run took %lli ticks...", r, toc_run - tic); + tot_run += toc_run - tic; + } + /* Dump the costs. */ + message("costs: run=%lli ticks.", tot_run / runs); + + /* Dump the tasks. */ + /* for ( k = 0 ; k < nr_timers ; k++ ) + printf( "%i %i %lli %lli\n" , timers[k].threadID , timers[k].type , + timers[k].tic , timers[k].toc ); */ +} /** * @brief Main function. */ - -int main ( int argc , char *argv[] ) { - int c, nr_threads; - int M = 4, N = 4, runs = 1, K = 32; - - /* Get the number of threads. */ - #pragma omp parallel shared(nr_threads) - { - if ( omp_get_thread_num() == 0 ) - nr_threads = omp_get_num_threads(); +int main(int argc, char* argv[]) { + + int c, nr_threads; + int M = 4, N = 4, runs = 1, K = 32; + +/* Get the number of threads. */ +#pragma omp parallel shared(nr_threads) + { + if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads(); + } + + /* Parse the options */ + while ((c = getopt(argc, argv, "m:n:k:r:t:")) != -1) switch (c) { + case 'm': + if (sscanf(optarg, "%d", &M) != 1) error("Error parsing dimension M."); + break; + case 'n': + if (sscanf(optarg, "%d", &N) != 1) error("Error parsing dimension M."); + break; + case 'k': + if (sscanf(optarg, "%d", &K) != 1) error("Error parsing tile size."); + break; + case 'r': + if (sscanf(optarg, "%d", &runs) != 1) + error("Error parsing number of runs."); + break; + case 't': + if (sscanf(optarg, "%d", &nr_threads) != 1) + error("Error parsing number of threads."); + omp_set_num_threads(nr_threads); + break; + case '?': + fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n", + argv[0]); + fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n" + "matrix using nr_threads threads.\n"); + exit(EXIT_FAILURE); } - - /* Parse the options */ - while ( ( c = getopt( argc , argv , "m:n:k:r:t:" ) ) != -1 ) - switch( c ) { - case 'm': - if ( sscanf( optarg , "%d" , &M ) != 1 ) - error( "Error parsing dimension M." ); - break; - case 'n': - if ( sscanf( optarg , "%d" , &N ) != 1 ) - error( "Error parsing dimension M." ); - break; - case 'k': - if ( sscanf( optarg , "%d" , &K ) != 1 ) - error( "Error parsing tile size." ); - break; - case 'r': - if ( sscanf( optarg , "%d" , &runs ) != 1 ) - error( "Error parsing number of runs." ); - break; - case 't': - if ( sscanf( optarg , "%d" , &nr_threads ) != 1 ) - error( "Error parsing number of threads." ); - omp_set_num_threads( nr_threads ); - break; - case '?': - fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] ); - fprintf( stderr , "Computes the tiled QR decomposition of an MxN tiled\n" - "matrix using nr_threads threads.\n" ); - exit( EXIT_FAILURE ); - } - - /* Dump arguments. */ - message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." , - 32*M , 32*N , nr_threads , runs ); - - /* Initialize the timers. */ - if ( ( timers = (struct timer *)malloc( sizeof(struct timer) * M*M*N ) ) == NULL ) - error( "Failed to allocate timers." ); - - test_qr( M , N , K , nr_threads , runs ); - - } - - + + /* Dump arguments. */ + message("Computing the tiled QR decomposition of a %ix%i matrix using %i " + "threads (%i runs).", + 32 * M, 32 * N, nr_threads, runs); + + /* Initialize the timers. */ + if ((timers = (struct timer*)malloc(sizeof(struct timer) * M * M * N)) == + NULL) + error("Failed to allocate timers."); + + test_qr(M, N, K, nr_threads, runs); +}