diff --git a/examples/Makefile.am b/examples/Makefile.am index 2d20cbb1630044ced0bfd7b6d87d05e38696623b..f01bdec4fbb88639d39b19f63a12ea59dcad5c09 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -34,10 +34,9 @@ test_CFLAGS = $(AM_CFLAGS) test_LDADD = ../src/.libs/libquicksched.a # Sources for test_qr -test_qr_SOURCES = test_qr.c -test_qr_CFLAGS = $(AM_CFLAGS) -I/home/aidan/lapack-3.5.0/lapacke/include/ -test_qr_LDFLAGS = -I/home/aidan/lapack-3.5.0/lapacke/include/ -test_qr_LDADD = ../src/.libs/libquicksched.a /home/aidan/lapack-3.5.0/liblapacke.a /home/aidan/lapack-3.5.0/liblapack.a -lblas +test_qr_SOURCES = test_qr.c /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a +test_qr_CFLAGS = $(AM_CFLAGS) -I/home/aidan/ATLAS/ATLAS_linux/include #-I/home/aidan/lapack-3.5.0/lapacke/include +test_qr_LDADD = ../src/.libs/libquicksched.a -lf77blas -lcblas -latlas -lm -L/home/aidan/ATLAS/ATLAS_linux/lib/ # Sources for test_bh #test_bh_SOURCES = test_bh.c diff --git a/examples/test_qr.c b/examples/test_qr.c index ada33a33acf4833ed478e10bcd0f0cadf466a5e5..97860917eb98eb1928a1ab37ce0568bc3f76ec38 100644 --- a/examples/test_qr.c +++ b/examples/test_qr.c @@ -1,22 +1,21 @@ /******************************************************************************* * This file is part of QuickSched. * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - ******************************************************************************/ - + * +* *****************************************************************************/ /* Config parameters. */ #include "../config.h" @@ -30,187 +29,161 @@ #include <omp.h> #include <pthread.h> - - - #include <cblas.h> - /* Local includes. */ #include "quicksched.h" /** - * Takes a column major matrix, NOT tile major. size is length of a side of the matrix. Only works for square matrices. - * This function is simply for validation and is implemented naively as we know of no implementation to retrieve Q from the tiled QR. + * Takes a column major matrix, NOT tile major. size is length of a side of the + * matrix. Only works for square matrices. + * This function is simply for validation and is implemented naively as we know + * of no implementation to retrieve Q from the tiled QR. */ -double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) -{ - double* Q = malloc(sizeof(double)*size*size); - double* Qtemp = malloc(sizeof(double)*size*size); - double* w = malloc(sizeof(double)*size); - double* ww = malloc(sizeof(double)*size*size); - double* temp = malloc(sizeof(double)*size*size); - int i, k, l, j, n; - bzero(Q, sizeof(double)*size*size); - bzero(Qtemp, sizeof(double)*size*size); - bzero(ww, sizeof(double)*size*size); - for(i = 0; i < size; i++) - { - Q[i*size + i] = 1.0; - } - int numcoltile = size / tilesize; - int numrowtile = size / tilesize; - for(k = 0; k < numrowtile; k++) - { - for(l = 0; l < tilesize; l++) - { - bzero(Qtemp, sizeof(double)*size*size); - for(i = 0; i < size; i++) - { - Qtemp[i*size + i] = 1.0; - } - - - for(i = k; i < numcoltile; i++) - { - bzero(w, sizeof(double)*size); - - for(j = 0 ; j < tilesize; j++) - { - w[i*tilesize + j] = HR[(k*tilesize+l)*size + i*tilesize+j]; - } - w[k*tilesize+l] = 1.0; - if(k*tilesize+l > i*tilesize) - { - for(j = 0; j < k*tilesize+l; j++) - w[j] = 0.0; - } - - - /* Compute (I - tau*w*w')' */ - for(j = 0; j < size; j++) - { - for(n = 0; n < size; n++) - { - if(j != n) - ww[n*size + j] = -tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n]; - else - ww[n*size + j] = 1.0 - tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n]; - } - } - - /* Qtemp = Qtemp * (I-tau*w*w')' */ - cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Qtemp, size, ww, size, 0.0, temp, size); - double *b = Qtemp; - Qtemp = temp; - temp = b; - } - cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Q, size, Qtemp, size, 0.0, temp, size); - double *b = Q; - Q = temp; - temp = b; +double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) { + double* Q = malloc(sizeof(double) * size * size); + double* Qtemp = malloc(sizeof(double) * size * size); + double* w = malloc(sizeof(double) * size); + double* ww = malloc(sizeof(double) * size * size); + double* temp = malloc(sizeof(double) * size * size); + int i, k, l, j, n; + bzero(Q, sizeof(double) * size * size); + bzero(Qtemp, sizeof(double) * size * size); + bzero(ww, sizeof(double) * size * size); + for (i = 0; i < size; i++) { + Q[i * size + i] = 1.0; + } + int numcoltile = size / tilesize; + int numrowtile = size / tilesize; + for (k = 0; k < numrowtile; k++) { + for (l = 0; l < tilesize; l++) { + bzero(Qtemp, sizeof(double) * size * size); + for (i = 0; i < size; i++) { + Qtemp[i * size + i] = 1.0; + } + + for (i = k; i < numcoltile; i++) { + bzero(w, sizeof(double) * size); + + for (j = 0; j < tilesize; j++) { + w[i * tilesize + j] = + HR[(k * tilesize + l) * size + i * tilesize + j]; + } + w[k * tilesize + l] = 1.0; + if (k * tilesize + l > i * tilesize) { + for (j = 0; j < k * tilesize + l; j++) w[j] = 0.0; } + + /* Compute (I - tau*w*w')' */ + for (j = 0; j < size; j++) { + for (n = 0; n < size; n++) { + if (j != n) + ww[n * size + j] = + -tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n]; + else + ww[n * size + j] = + 1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n]; + } + } + + /* Qtemp = Qtemp * (I-tau*w*w')' */ + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, + 1.0, Qtemp, size, ww, size, 0.0, temp, size); + double* b = Qtemp; + Qtemp = temp; + temp = b; + } + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, + 1.0, Q, size, Qtemp, size, 0.0, temp, size); + double* b = Q; + Q = temp; + temp = b; } - - free(Qtemp); - free(w); - free(ww); - free(temp); - return Q; + } + + free(Qtemp); + free(w); + free(ww); + free(temp); + return Q; } -double* getR(double*HR, int size) -{ - double* R = malloc(sizeof(double) * size * size); - int i, j; - bzero(R, sizeof(double) * size * size); - for(i = 0; i< size; i++) - { - for(j = 0; j <= i; j++) - { - R[i*size + j] = HR[i*size + j]; - } +double* getR(double* HR, int size) { + double* R = malloc(sizeof(double) * size * size); + int i, j; + bzero(R, sizeof(double) * size * size); + for (i = 0; i < size; i++) { + for (j = 0; j <= i; j++) { + R[i * size + j] = HR[i * size + j]; } - return R; + } + return R; } -void printMatrix(double* Matrix, int m, int n, int tilesize) -{ - int i, j; - - for(i=0; i < m*tilesize; i++) - { - for(j = 0; j < n*tilesize; j++) - { - printf(" %.3f ", Matrix[j*m*tilesize +i]); +void printMatrix(double* Matrix, int m, int n, int tilesize) { + int i, j; - } - printf("\n"); + for (i = 0; i < m * tilesize; i++) { + for (j = 0; j < n * tilesize; j++) { + printf(" %.3f ", Matrix[j * m * tilesize + i]); } - + printf("\n"); + } } -double* columnToTile( double* columnMatrix, int size , int m , int n , int tilesize) -{ - double* TileMatrix; - TileMatrix = malloc(sizeof(double) * size ); - if(TileMatrix == NULL) - error("failed to allocate TileMatrix"); - int i,j,k,l; - - for( i = 0; i < n ; i++ ) - { - for( j = 0; j < m; j++ ) - { - double *tileStart = &columnMatrix[i*m*tilesize*tilesize + j*tilesize]; - double *tilePos = &TileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize]; - for(k = 0; k < tilesize; k++) - { - tileStart = &columnMatrix[i*m*tilesize*tilesize+k*m*tilesize + j*tilesize]; - for( l = 0; l < tilesize; l++ ) - { - tilePos[k*tilesize + l] = tileStart[l]; - } - } +double* columnToTile(double* columnMatrix, int size, int m, int n, + int tilesize) { + double* TileMatrix; + TileMatrix = malloc(sizeof(double) * size); + if (TileMatrix == NULL) error("failed to allocate TileMatrix"); + int i, j, k, l; + + for (i = 0; i < n; i++) { + for (j = 0; j < m; j++) { + double* tileStart = + &columnMatrix[i * m * tilesize * tilesize + j * tilesize]; + double* tilePos = + &TileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize]; + for (k = 0; k < tilesize; k++) { + tileStart = &columnMatrix[i * m * tilesize * tilesize + + k * m * tilesize + j * tilesize]; + for (l = 0; l < tilesize; l++) { + tilePos[k * tilesize + l] = tileStart[l]; } + } } + } - return TileMatrix; - + return TileMatrix; } -double* tileToColumn( double* tileMatrix, int size, int m , int n , int tilesize) -{ - double* ColumnMatrix; - ColumnMatrix = (double*) malloc(sizeof(double) * size ); - if(ColumnMatrix == NULL) - error("failed to allocate ColumnMatrix"); - int i,j,k,l; - for( i = 0; i < n ; i++ ) - { - for(j = 0; j < m ; j++ ) - { - /* Tile on ith column is at i*m*32*32.*/ - /* Tile on jth is at j*32*32 */ - double *tile = &tileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize]; - /* Column starts at same position as tile. */ - /* Row j*32.*/ - double *tilePos = &ColumnMatrix[i*m*tilesize*tilesize + j*tilesize]; - for( k = 0; k < tilesize; k++ ) - { - for(l=0; l < tilesize; l++) - { - tilePos[l] = tile[l]; - } - /* Next 32 elements are the position of the tile in the next column.*/ - tile = &tile[tilesize]; - /* Move to the j*32th position in the next column. */ - tilePos = &tilePos[tilesize*m]; - - } - } +double* tileToColumn(double* tileMatrix, int size, int m, int n, int tilesize) { + double* ColumnMatrix; + ColumnMatrix = (double*)malloc(sizeof(double) * size); + if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix"); + int i, j, k, l; + for (i = 0; i < n; i++) { + for (j = 0; j < m; j++) { + /* Tile on ith column is at i*m*32*32.*/ + /* Tile on jth is at j*32*32 */ + double* tile = + &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize]; + /* Column starts at same position as tile. */ + /* Row j*32.*/ + double* tilePos = + &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize]; + for (k = 0; k < tilesize; k++) { + for (l = 0; l < tilesize; l++) { + tilePos[l] = tile[l]; + } + /* Next 32 elements are the position of the tile in the next column.*/ + tile = &tile[tilesize]; + /* Move to the j*32th position in the next column. */ + tilePos = &tilePos[tilesize * m]; + } } - return ColumnMatrix; + } + return ColumnMatrix; } /* Routines for the tiled QR decomposition.*/ @@ -219,89 +192,80 @@ double* tileToColumn( double* tileMatrix, int size, int m , int n , int tilesize * * @brief Computes the QR decomposition of a tile. * - * @param cornerTile A pointer to the tile for which the decomposition is computed. + * @param cornerTile A pointer to the tile for which the decomposition is +* computed. * @param tilesize The number of elements on a row/column of the tile. * @param tauMatrix A pointer to the tau Matrix. * @param k The value of k for the QR decomposition - * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). + * @param tauNum The number of tau values stored for each row of the matrix +* (this is equal to the number of tiles on each column). * * */ -void DGEQRF(double* restrict cornerTile, int tileSize, double* restrict tauMatrix, int k, int tauNum, double* w) -{ - int i, j, n; - double norm=0.0, sign, u1, tau, z; - - - /*Find the householder vector for each row. */ - for(i = 0; i < tileSize; i++) - { - norm = 0.0; - /*Fill w with the vector.*/ - for(j=i; j < tileSize; j++) - { - /* ith row is i*tileSize, only want elements on diagonal or below.*/ - w[ j ] = cornerTile[ i*tileSize +j ]; - /*Find the norm as well */ - norm = norm + w[j]*w[j]; - } - if(w[i] >= 0.0) - sign = -1; - else - sign = 1; - - norm = sqrt(norm); - - u1 = w[i] - sign*norm; - - if(u1 != 0.0) - { - for(j=i+1; j < tileSize; j++) - w[j] = w[j] / u1; - } - else - { - for(j=i+1; j < tileSize; j++) - w[j] = 0.0; - } - - if(norm != 0.0) - tau = -sign*u1/norm; - else - tau = 0.0; +void DGEQRF(double* restrict cornerTile, int tileSize, + double* restrict tauMatrix, int k, int tauNum) { + int i, j, n; + double norm = 0.0, sign, u1, tau, z; + double w[tileSize]; + + /*Find the householder vector for each row. */ + for (i = 0; i < tileSize; i++) { + norm = 0.0; + /*Fill w with the vector.*/ + for (j = i; j < tileSize; j++) { + /* ith row is i*tileSize, only want elements on diagonal or below.*/ + w[j] = cornerTile[i * tileSize + j]; + /*Find the norm as well */ + norm = norm + w[j] * w[j]; + } + if (w[i] >= 0.0) + sign = -1; + else + sign = 1; + + norm = sqrt(norm); + u1 = w[i] - sign * norm; +// if(i==1) + // printf("%.3f\n", u1); + if (u1 != 0.0) { + for (j = i + 1; j < tileSize; j++) w[j] = w[j] / u1; + } else { + for (j = i + 1; j < tileSize; j++) w[j] = 0.0; + } - /*Store the below diagonal vector */ - - for(j = i+1; j < tileSize; j++) - { - cornerTile[ i*tileSize +j ] = w[j]; - } - cornerTile[ i*tileSize + i ] = sign*norm; - w[i] = 1.0; - /* Apply the householder transformation to the rest of the tile, for everything to the right of the diagonal. */ - for(j = i+1; j < tileSize; j++) - { - /*Compute w'*A_j*/ - z = cornerTile[ j*tileSize+i]; - for(n = i+1; n < tileSize; n++) - { - z = z + cornerTile[j*tileSize+n] * w[n]; - } - /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */ - for(n=i; n < tileSize; n++) - { - cornerTile[j*tileSize+n] = cornerTile[j*tileSize+n] - tau*w[n]*z; - } + if (norm != 0.0) + tau = -sign * u1 / norm; + else + tau = 0.0; + + // printf("%.3f\n", sign); - } - /* Store tau. We're on k*tileSize+ith row. kth column.*/ - tauMatrix[(k*tileSize+i)*tauNum+k] = tau; + /*Store the below diagonal vector */ + for (j = i + 1; j < tileSize; j++) { + cornerTile[i * tileSize + j] = w[j]; } - + cornerTile[i * tileSize + i] = sign * norm; + w[i] = 1.0; + /* Apply the householder transformation to the rest of the tile, for + * everything to the right of the diagonal. */ + for (j = i + 1; j < tileSize; j++) { + /*Compute w'*A_j*/ + z = cornerTile[j * tileSize + i]; + for (n = i + 1; n < tileSize; n++) { + z = z + cornerTile[j * tileSize + n] * w[n]; + } + /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */ + for (n = i; n < tileSize; n++) { + cornerTile[j * tileSize + n] = + cornerTile[j * tileSize + n] - tau * w[n] * z; + } + } + /* Store tau. We're on k*tileSize+ith row. kth column.*/ + tauMatrix[(k * tileSize + i) * tauNum + k] = tau; + } } - /** * * @brief Applies the householder factorisation of the corner to the row tile. @@ -312,46 +276,39 @@ void DGEQRF(double* restrict cornerTile, int tileSize, double* restrict tauMatri * @param tauMatrix A pointer to the tau Matrix. * @param jj The value of j for the QR decomposition. * @param kk The value of k for the QR decomposition. - * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). + * @param tauNum The number of tau values stored for each row of the matrix +* (this is equal to the number of tiles on each column). * * */ -void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize, int jj, int kk, double* restrict tauMatrix, int tauNum, double* w) -{ - int i, j, n; - double z=0.0; - - - /* For each row in the corner Tile*/ - for(i = 0; i < tileSize; i++) - { - /*Get w for row i */ - for(j = i; j < tileSize; j++) - { - w[j] = cornerTile[i*tileSize + j]; - } - w[i] = 1.0; - - /* Apply to the row Tile */ - for(j = 0; j < tileSize; j++) - { - z=0.0; - /* Below Diagonal!*/ - /*Compute w'*A_j*/ - for(n = i; n < tileSize; n++) - { - z = z + w[n] * rowTile[j*tileSize+n]; - } - for(n = i; n < tileSize; n++) - { - rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z; - } - } - - +void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize, + int jj, int kk, double* restrict tauMatrix, int tauNum) { + int i, j, n; + double z = 0.0; + double w[tileSize]; + + /* For each row in the corner Tile*/ + for (i = 0; i < tileSize; i++) { + /*Get w for row i */ + for (j = i; j < tileSize; j++) { + w[j] = cornerTile[i * tileSize + j]; } - - + w[i] = 1.0; + /* Apply to the row Tile */ + for (j = 0; j < tileSize; j++) { + z = 0.0; + /* Below Diagonal!*/ + /*Compute w'*A_j*/ + for (n = i; n < tileSize; n++) { + z = z + w[n] * rowTile[j * tileSize + n]; + } + for (n = i; n < tileSize; n++) { + rowTile[j * tileSize + n] = + rowTile[j * tileSize + n] - + tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z; + } + } + } } /** @@ -364,83 +321,78 @@ void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize, * @param tauMatrix A pointer to the tau Matrix. * @param ii The value of i for the QR decomposition. * @param kk The value of k for the QR decomposition. - * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). + * @param tauNum The number of tau values stored for each row of the matrix +* (this is equal to the number of tiles on each column). * * */ -void DTSQRF( double* restrict cornerTile, double* restrict columnTile, int tilesize, int ii, int kk, double* restrict tauMatrix, int tauNum, double* w ) -{ - int i, j, n; - double norm=0.0, sign, u1, tau, z; - - /* For each column compute the householder vector. */ - for(i = 0; i < tilesize; i++) - { - norm = 0.0; - w[i] = cornerTile[ i*tilesize+i ]; - norm = norm + w[i]*w[i]; - for(j = i+1; j < tilesize; j++) - { - w[j] = 0.0; - } - for(j = 0; j < tilesize; j++) - { - w[tilesize+j] = columnTile[ i*tilesize+j ]; - norm = norm + w[tilesize+j]*w[tilesize+j]; - } - - norm = sqrt(norm); - if(w[i] >= 0.0) - sign = -1; - else - sign = 1; - - - u1 = w[i] - sign*norm; - if(u1 != 0.0) - { - for(j = i+1; j < 2*tilesize; j++){ - w[j] = w[j]/u1; - } - }else - { - for(j = i+1; j < 2*tilesize; j++) - w[j] = 0.0; - } +void DTSQRF(double* restrict cornerTile, double* restrict columnTile, + int tilesize, int ii, int kk, double* restrict tauMatrix, + int tauNum) { + int i, j, n; + double norm = 0.0, sign, u1, tau, z; + double w[2*tilesize]; + + /* For each column compute the householder vector. */ + for (i = 0; i < tilesize; i++) { + norm = 0.0; + w[i] = cornerTile[i * tilesize + i]; + norm = norm + w[i] * w[i]; + for (j = i + 1; j < tilesize; j++) { + w[j] = 0.0; + } + for (j = 0; j < tilesize; j++) { + w[tilesize + j] = columnTile[i * tilesize + j]; + norm = norm + w[tilesize + j] * w[tilesize + j]; + } - if(norm != 0) - tau = -sign*u1/norm; - else - tau = 0.0; - - /* Apply to each row to the right.*/ - for(j = i; j < tilesize; j++) - { - /* Find w'*A_j, w is 0s except for first value with upper tile.*/ - z = 1.0 * cornerTile[ j*tilesize+i ]; - for(n = 0; n < tilesize; n++) - { - z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ]; - } - /* Apply to upper tile.*/ - cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z; - for(n = i+1; n < tilesize; n++) - { - cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z; - } - /* Apply to lower tile.*/ - for(n = 0; n < tilesize; n++) - { - columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z; - } - - } - /* Store w*/ - for(j = 0; j < tilesize; j++){ - columnTile[ i*tilesize+j ] = w[tilesize+j]; - } - tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau; + norm = sqrt(norm); +// printf("%.3f\n", norm); + + if (w[i] >= 0.0) + sign = -1; + else + sign = 1; + u1 = w[i] - sign * norm; + if (u1 != 0.0) { + for (j = i + 1; j < 2 * tilesize; j++) { + w[j] = w[j] / u1; + } + } else { + for (j = i + 1; j < 2 * tilesize; j++) w[j] = 0.0; + } + + if (norm != 0) + tau = -sign * u1 / norm; + else + tau = 0.0; + + /* Apply to each row to the right.*/ + for (j = i; j < tilesize; j++) { + /* Find w'*A_j, w is 0s except for first value with upper tile.*/ + z = 1.0 * cornerTile[j * tilesize + i]; + for (n = 0; n < tilesize; n++) { + z = z + w[tilesize + n] * columnTile[j * tilesize + n]; + } + /* Apply to upper tile.*/ + cornerTile[j * tilesize + i] = + cornerTile[j * tilesize + i] - tau * 1.0 * z; + for (n = i + 1; n < tilesize; n++) { + cornerTile[j * tilesize + n] = + cornerTile[j * tilesize + n] - tau * w[n] * z; + } + /* Apply to lower tile.*/ + for (n = 0; n < tilesize; n++) { + columnTile[j * tilesize + n] = + columnTile[j * tilesize + n] - tau * w[tilesize + n] * z; + } } + /* Store w*/ + for (j = 0; j < tilesize; j++) { + columnTile[i * tilesize + j] = w[tilesize + j]; + } + tauMatrix[(kk * tilesize + i) * tauNum + ii] = tau; + } } /** @@ -455,46 +407,45 @@ void DTSQRF( double* restrict cornerTile, double* restrict columnTile, int tiles * @param ii The value of i for the QR decomposition. * @param jj The value of j for the QR decomposition. * @param kk The value of k for the QR decomposition. - * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). + * @param tauNum The number of tau values stored for each row of the matrix +* (this is equal to the number of tiles on each column). * * */ -void DSSRFT( double* restrict cornerTile, double* restrict columnTile, double* restrict rowTile, int tilesize, int ii, int jj, int kk, double* restrict tauMatrix, int tauNum , double* w) -{ - int i, j, n; - double z; - - - for(i = 0; i < tilesize; i++) - { - for(j = 0; j < i; j++) - w[j] = 0.0; - w[i] = 1.0; - for(j = i+1; j < tilesize; j++) - w[j] = 0.0; - for(j = 0; j < tilesize; j++) - w[j+tilesize] = columnTile[i*tilesize +j]; - - /* Apply householder vector (w) to the tiles.*/ - for(j = 0; j < tilesize; j++) - { - z = 0.0; - /* Compute w' * A_j */ - for(n = 0; n < tilesize; n++) - { - z += w[n] * rowTile[j*tilesize+n]; - z += w[n + tilesize] * cornerTile[j*tilesize+n]; - } - for(n = 0; n < tilesize; n++) - { - rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z; - cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z; - } - } +void DSSRFT(double* restrict cornerTile, double* restrict columnTile, + double* restrict rowTile, int tilesize, int ii, int jj, int kk, + double* restrict tauMatrix, int tauNum) { + int i, j, n; + double z; + double w[2*tilesize]; + + for (i = 0; i < tilesize; i++) { + for (j = 0; j < i; j++) w[j] = 0.0; + w[i] = 1.0; + for (j = i + 1; j < tilesize; j++) w[j] = 0.0; + for (j = 0; j < tilesize; j++) + w[j + tilesize] = columnTile[i * tilesize + j]; + + /* Apply householder vector (w) to the tiles.*/ + for (j = 0; j < tilesize; j++) { + z = 0.0; + /* Compute w' * A_j */ + for (n = 0; n < tilesize; n++) { + z += w[n] * rowTile[j * tilesize + n]; + z += w[n + tilesize] * cornerTile[j * tilesize + n]; + } + for (n = 0; n < tilesize; n++) { + rowTile[j * tilesize + n] = + rowTile[j * tilesize + n] - + tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[n] * z; + cornerTile[j * tilesize + n] = + cornerTile[j * tilesize + n] - + tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[tilesize + n] * z; + } } + } +} -} - /** * @brief Computed a tiled QR factorization using QuickSched. * @@ -502,302 +453,385 @@ void DSSRFT( double* restrict cornerTile, double* restrict columnTile, double* r * @param n Number of tile columns. * @param nr_threads Number of threads to use. */ - -void test_qr ( int m , int n , int K , int nr_threads , int runs, double *matrix ) { - - int k, j, i; - double *A, *A_orig, *tau; - struct qsched s; - qsched_task_t *tid, tid_new; - qsched_res_t *rid; - int data[3]; - ticks tic, toc_run, tot_setup, tot_run = 0; - - enum task_types { task_DGEQRF , task_DLARFT , task_DTSQRF , task_DSSRFT }; - - /* Runner function to pass to the scheduler. */ - void runner ( int type , void *data ) { - - /* Decode the task data. */ - int *idata = (int *)data; - int i = idata[0], j = idata[1], k = idata[2]; - double ww[ 2*K ]; - - /* Decode and execute the task. */ - switch ( type ) { - case task_DGEQRF: - DGEQRF( &A[ (k*m+k)*K*K], K, tau, k, m, ww); - break; - case task_DLARFT: - DLARFT( &A[ (k*m+k)*K*K ],&A[(j*m+k)*K*K], K, j, k, tau, m, ww); - break; - case task_DTSQRF: - DTSQRF( &A[(k*m+k)*K*K], &A[(k*m+i)*K*K], K, i, k, tau, m , ww); - break; - case task_DSSRFT: - DSSRFT(&A[(j*m+i)*K*K], &A[(k*m+i)*K*K], &A[(j*m+k)*K*K], K, i, j, k, tau, m , ww); - break; - default: - error( "Unknown task type." ); - } - - } - - - /* Allocate and fill the original matrix. */ - if ( ( A = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL || - ( tau = (double *)malloc( sizeof(double) * m * n * K ) ) == NULL || - ( A_orig = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL ) - error( "Failed to allocate matrices." ); - for ( k = 0 ; k < m * n * K * K ; k++ ) - { - if(matrix == NULL) - A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0; - else - A_orig[k] = matrix[k]; +void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) { + + int k, j, i; + double* A, *A_orig, *tau; + struct qsched s; + qsched_task_t* tid, tid_new; + qsched_res_t* rid; + int data[3]; + ticks tic, toc_run, tot_setup, tot_run = 0; + + enum task_types { + task_DGEQRF, + task_DLARFT, + task_DTSQRF, + task_DSSRFT + }; + + /* Runner function to pass to the scheduler. */ + void runner(int type, void * data) { + + /* Decode the task data. */ + int* idata = (int*)data; + int i = idata[0], j = idata[1], k = idata[2]; + + /* Decode and execute the task. */ + switch (type) { + case task_DGEQRF: + DGEQRF(&A[(k * m + k) * K * K], K, tau, k, m); + break; + case task_DLARFT: + DLARFT(&A[(k * m + k) * K * K], &A[(j * m + k) * K * K], K, j, k, tau, + m); + break; + case task_DTSQRF: + DTSQRF(&A[(k * m + k) * K * K], &A[(k * m + i) * K * K], K, i, k, tau, + m); + break; + case task_DSSRFT: + DSSRFT(&A[(j * m + i) * K * K], &A[(k * m + i) * K * K], + &A[(j * m + k) * K * K], K, i, j, k, tau, m); + break; + default: + error("Unknown task type."); } - memcpy( A , A_orig , sizeof(double) * m * n * K * K ); - bzero( tau , sizeof(double) * m * n * K ); - - /* Dump A_orig. */ - /* message( "A_orig = [" ); - for ( k = 0 ; k < m*K ; k++ ) { - for ( j = 0 ; j < n*K ; j++ ) - printf( "%.3f " , A_orig[ j*m*K + k ] ); - printf( "\n" ); - } - printf( "];\n" ); */ - - /* Initialize the scheduler. */ - qsched_init( &s , nr_threads , qsched_flag_none ); - - /* Allocate and init the task ID and resource ID matrix. */ - tic = getticks(); - if ( ( tid = (qsched_task_t *)malloc( sizeof(qsched_task_t) * m * n ) ) == NULL || - ( rid = (qsched_res_t *)malloc( sizeof(qsched_res_t) * m * n ) ) == NULL ) - error( "Failed to allocate tid/rid matrix." ); - for ( k = 0 ; k < m * n ; k++ ) { - tid[k] = qsched_task_none; - rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none ); - } - - /* Build the tasks. */ - for ( k = 0 ; k < m && k < n ; k++ ) { - - /* Add kth corner task. */ - data[0] = k; data[1] = k; data[2] = k; - tid_new = qsched_addtask( &s , task_DGEQRF , task_flag_none , data , sizeof(int)*3 , 2 ); - qsched_addlock( &s , tid_new , rid[ k*m + k ] ); - if ( tid[ k*m + k ] != -1 ) - qsched_addunlock( &s , tid[ k*m + k ] , tid_new ); - tid[ k*m + k ] = tid_new; - - /* Add column tasks on kth row. */ - for ( j = k+1 ; j < n ; j++ ) { - data[0] = k; data[1] = j; data[2] = k; - tid_new = qsched_addtask( &s , task_DLARFT , task_flag_none , data , sizeof(int)*3 , 3 ); - qsched_addlock( &s , tid_new , rid[ j*m + k ] ); - qsched_adduse( &s , tid_new , rid[ k*m + k ] ); - qsched_addunlock( &s , tid[ k*m + k ] , tid_new ); - if ( tid[ j*m + k ] != -1 ) - qsched_addunlock( &s , tid[ j*m + k ] , tid_new ); - tid[ j*m + k ] = tid_new; - } - - /* For each following row... */ - for ( i = k+1 ; i < m ; i++ ) { - - /* Add the row taks for the kth column. */ - data[0] = i; data[1] = k; data[2] = k; - tid_new = qsched_addtask( &s , task_DTSQRF , task_flag_none , data , sizeof(int)*3 , 3 ); - qsched_addlock( &s , tid_new , rid[ k*m + i ] ); - qsched_adduse( &s , tid_new , rid[ k*m + k ] ); - qsched_addunlock( &s , tid[ k*m + (i-1) ] , tid_new ); - if ( tid[ k*m + i ] != -1 ) - qsched_addunlock( &s , tid[ k*m + i ] , tid_new ); - tid[ k*m + i ] = tid_new; - - /* Add the inner tasks. */ - for ( j = k+1 ; j < n ; j++ ) { - data[0] = i; data[1] = j; data[2] = k; - tid_new = qsched_addtask( &s , task_DSSRFT , task_flag_none , data , sizeof(int)*3 , 5 ); - qsched_addlock( &s , tid_new , rid[ j*m + i ] ); - qsched_adduse( &s , tid_new , rid[ k*m + i ] ); - qsched_adduse( &s , tid_new , rid[ j*m + k ] ); - qsched_addunlock( &s , tid[ k*m + i ] , tid_new ); - qsched_addunlock( &s, tid[j*m+i-1], tid_new); - if ( tid[ j*m + i ] != -1 ) - qsched_addunlock( &s , tid[ j*m + i ] , tid_new ); - - tid[ j*m + i ] = tid_new; - } - - } - - } /* build the tasks. */ - tot_setup = getticks() - tic; - - /* Dump the number of tasks. */ - message( "total nr of tasks: %i." , s.count ); - message( "total nr of deps: %i." , s.count_deps ); - message( "total nr of res: %i." , s.count_res ); - message( "total nr of locks: %i." , s.count_locks ); - message( "total nr of uses: %i." , s.count_uses ); - - - /* Loop over the number of runs. */ - for ( k = 0 ; k < runs ; k++ ) { - - /* Execute the the tasks. */ - tic = getticks(); - qsched_run( &s , nr_threads , runner ); - toc_run = getticks(); - message( "%ith run took %lli ticks..." , k , toc_run - tic ); - tot_run += toc_run - tic; - - } - - - /* Dump A. */ - /* message( "A = [" ); - for ( k = 0 ; k < m*K ; k++ ) { - for ( j = 0 ; j < n*K ; j++ ) - printf( "%.3f " , A[ j*m*K + k ] ); - printf( "\n" ); - } - printf( "];\n" ); */ - - /* Dump tau. */ - /* message( "tau = [" ); - for ( k = 0 ; k < m*K ; k++ ) { - for ( j = 0 ; j < n ; j++ ) - printf( "%.3f " , tau[ j*m*K + k ] ); - printf( "\n" ); - } - printf( "];\n" ); */ - - /* Dump the tasks. */ - /* for ( k = 0 ; k < s.count ; k++ ) { - int *d = (int *)&s.data[ s.tasks[k].data ]; - printf( " %i %i %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid , d[0] , d[1] , s.tasks[k].tic , s.tasks[k].toc ); - } */ - - /* Dump the costs. */ - message( "costs: setup=%lli ticks, run=%lli ticks." , - tot_setup , tot_run/runs ); - - /* Dump the timers. */ - for ( k = 0 ; k < qsched_timer_count ; k++ ) - message( "timer %s is %lli ticks." , qsched_timer_names[k] , s.timers[k]/runs ); - - if(matrix != NULL) - { - for(k = 0; k < m*n*K*K; k++) - matrix[k] = A[k]; + } + + /* Allocate and fill the original matrix. */ + if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL || + (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL || + (A_orig = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL) + error("Failed to allocate matrices."); + for (k = 0; k < m * n * K * K; k++) { + if (matrix == NULL) + A_orig[k] = 2 * ((double)rand()) / RAND_MAX - 1.0; + else + A_orig[k] = matrix[k]; + } + memcpy(A, A_orig, sizeof(double) * m * n * K * K); + bzero(tau, sizeof(double) * m * n * K); + + /* Dump A_orig. */ + /* message( "A_orig = [" ); + for ( k = 0 ; k < m*K ; k++ ) { + for ( j = 0 ; j < n*K ; j++ ) + printf( "%.3f " , A_orig[ j*m*K + k ] ); + printf( "\n" ); + } + printf( "];\n" ); */ + + /* Initialize the scheduler. */ + qsched_init(&s, nr_threads, qsched_flag_none); + + /* Allocate and init the task ID and resource ID matrix. */ + tic = getticks(); + if ((tid = (qsched_task_t*)malloc(sizeof(qsched_task_t)* m* n)) == NULL || + (rid = (qsched_res_t*)malloc(sizeof(qsched_res_t) * m * n)) == NULL) + error("Failed to allocate tid/rid matrix."); + for (k = 0; k < m * n; k++) { + tid[k] = qsched_task_none; + rid[k] = qsched_addres(&s, qsched_owner_none, qsched_res_none, NULL, 0); + } + + /* Build the tasks. */ + for (k = 0; k < m && k < n; k++) { + + /* Add kth corner task. */ + data[0] = k; + data[1] = k; + data[2] = k; + tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, data, + sizeof(int) * 3, 2); + qsched_addlock(&s, tid_new, rid[k * m + k]); + if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new); + tid[k * m + k] = tid_new; + + /* Add column tasks on kth row. */ + for (j = k + 1; j < n; j++) { + data[0] = k; + data[1] = j; + data[2] = k; + tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, data, + sizeof(int) * 3, 3); + qsched_addlock(&s, tid_new, rid[j * m + k]); + qsched_adduse(&s, tid_new, rid[k * m + k]); + qsched_addunlock(&s, tid[k * m + k], tid_new); + if (tid[j * m + k] != -1) qsched_addunlock(&s, tid[j * m + k], tid_new); + tid[j * m + k] = tid_new; } - - /* Test if the decomposition was correct.*/ - /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K); - double *Q = computeQ(tempMatrix, m*K, K, tau, m); - double *R = getR(tempMatrix, m*K); - cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, m*K, R, m*K, 0.0, tempMatrix, m*K); - free(Q); - Q = tileToColumn(A_orig, m*n*K*K, m, n, K); - for(i = 0; i < m * n * K * K; i++) - { - if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] < 0.995)) - printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i], tempMatrix[i]); - } - free(tempMatrix); - free(Q); - free(R);*/ - - /* Clean up. */ - free(A); - free(A_orig); - free(tau); - free(tid); - free(rid); - qsched_free( &s ); - + /* For each following row... */ + for (i = k + 1; i < m; i++) { + + /* Add the row taks for the kth column. */ + data[0] = i; + data[1] = k; + data[2] = k; + tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, data, + sizeof(int) * 3, 3); + qsched_addlock(&s, tid_new, rid[k * m + i]); + qsched_adduse(&s, tid_new, rid[k * m + k]); + qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new); + if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new); + tid[k * m + i] = tid_new; + + /* Add the inner tasks. */ + for (j = k + 1; j < n; j++) { + data[0] = i; + data[1] = j; + data[2] = k; + tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, data, + sizeof(int) * 3, 5); + qsched_addlock(&s, tid_new, rid[j * m + i]); + qsched_adduse(&s, tid_new, rid[k * m + i]); + qsched_adduse(&s, tid_new, rid[j * m + k]); + // qsched_addunlock(&s, tid[k * m + i], tid_new); + qsched_addunlock(&s, tid[j * m + i - 1], tid_new); + if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new); + + tid[j * m + i] = tid_new; + } } + } /* build the tasks. */ + tot_setup = getticks() - tic; + + /* Dump the number of tasks. */ + message("total nr of tasks: %i.", s.count); + message("total nr of deps: %i.", s.count_deps); + message("total nr of res: %i.", s.count_res); + message("total nr of locks: %i.", s.count_locks); + message("total nr of uses: %i.", s.count_uses); + + /* Loop over the number of runs. */ + for (k = 0; k < runs; k++) { + + /* Execute the the tasks. */ + tic = getticks(); + qsched_run(&s, nr_threads, runner); + toc_run = getticks(); + message("%ith run took %lli ticks...", k, toc_run - tic); + tot_run += toc_run - tic; + } + + /* Dump A. */ + /* message( "A = [" ); + for ( k = 0 ; k < m*K ; k++ ) { + for ( j = 0 ; j < n*K ; j++ ) + printf( "%.3f " , A[ j*m*K + k ] ); + printf( "\n" ); + } + printf( "];\n" ); */ + + /* Dump tau. */ + /* message( "tau = [" ); + for ( k = 0 ; k < m*K ; k++ ) { + for ( j = 0 ; j < n ; j++ ) + printf( "%.3f " , tau[ j*m*K + k ] ); + printf( "\n" ); + } + printf( "];\n" ); */ + + /* Dump the tasks. */ + /* for ( k = 0 ; k < s.count ; k++ ) { + int *d = (int *)&s.data[ s.tasks[k].data ]; + printf( " %i %i %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid , + d[0] , d[1] , s.tasks[k].tic , s.tasks[k].toc ); + } */ + + /* Dump the costs. */ + message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup, + tot_run / runs); + + /* Dump the timers. */ + for (k = 0; k < qsched_timer_count; k++) + message("timer %s is %lli ticks.", qsched_timer_names[k], + s.timers[k] / runs); + + if (matrix != NULL) { + for (k = 0; k < m * n * K * K; k++) matrix[k] = A[k]; + } + + /* Test if the decomposition was correct.*/ + /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K); + double *Q = computeQ(tempMatrix, m*K, K, tau, m); + double *R = getR(tempMatrix, m*K); + cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, + m*K, R, m*K, 0.0, tempMatrix, m*K); + free(Q); + Q = tileToColumn(A_orig, m*n*K*K, m, n, K); + for(i = 0; i < m * n * K * K; i++) + { + if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] < + 0.995)) + printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i], + tempMatrix[i]); + } + free(tempMatrix); + free(Q); + free(R);*/ + + /* Clean up. */ + free(A); + free(A_orig); + free(tau); + free(tid); + free(rid); + qsched_free(&s); +} /* Generates a random matrix. */ -double* generateColumnMatrix(int size) +double* generateColumnMatrix(int size, unsigned long int m_z) { + double* matrix = malloc(sizeof(double) * size * size); + if (matrix == NULL) error("Failed to allocate matrix"); + + int i; + for (i = 0; i < size * size; i++) { + m_z = (1664525 * m_z + 1013904223) % 4294967296; + matrix[i] = m_z % 100; + if (matrix[i] < 0) matrix[i] += 100; + } + return matrix; +} + + +/* Unit tests for the tiled QR decomposition.*/ +void test_SSSRFT() { - double* matrix = malloc(sizeof(double)*size*size); - if(matrix == NULL) - error("Failed to allocate matrix"); - - unsigned long int m_z = 35532; - int i; - for(i = 0 ; i < size*size; i++) - { - m_z = (1664525*m_z + 1013904223) % 4294967296; - matrix[i] = m_z % 100; - if(matrix[i] < 0) - matrix[i] += 100; - } - return matrix; + double* cornerTile, *rowTile, *columnTile, *otherTile; + double tau[1024]; + cornerTile = generateColumnMatrix(32, 35136); + rowTile = generateColumnMatrix(32, 35239); + columnTile = generateColumnMatrix(32, 35339); + otherTile = generateColumnMatrix(32, 35739); +DGEQRF(cornerTile, 32, tau, 0, 2); + DTSQRF(cornerTile, columnTile, + 32, 1, 0, tau, + 2); + DLARFT(cornerTile,rowTile, 32, + 1, 0, tau, 2); + DSSRFT(otherTile, columnTile, + rowTile, 32, 1, 1, 0, + tau, 2); + printMatrix(cornerTile, 1, 1, 32); + printf("\n\n\n\n\n Column Tile \n\n\n\n\n"); + printMatrix(columnTile, 1, 1, 32); + printf("\n\n\n\n\n Row Tile \n\n\n\n\n"); + printMatrix(rowTile, 1, 1, 32); + printf("\n\n\n\n\n Other Tile \n\n\n\n\n"); + printMatrix(otherTile, 1, 1, 32); + free(cornerTile); + free(rowTile); + free(columnTile); + free(otherTile); + +} +void test_STSQRF() +{ + double *cornerTile, *columnTile; + double tau[1024]; + cornerTile = generateColumnMatrix(32, 35531); + columnTile = generateColumnMatrix(32, 35585); + DGEQRF(cornerTile, 32, tau, 0, 2); + DTSQRF(cornerTile, columnTile, + 32, 1, 0, tau, + 2); + printMatrix(cornerTile, 1, 1, 32); + printf("\n\n\n\n\n Column Tile \n\n\n\n\n"); + printMatrix(columnTile, 1, 1, 32); + free(cornerTile); + free(columnTile); +} +void test_SLARFT() +{ + double* cornerTile, *rowTile; + double tau[1024]; + cornerTile = generateColumnMatrix(32, 35536); + rowTile = generateColumnMatrix(32, 35539); + DGEQRF(cornerTile, 32,tau, 0, 2); + DLARFT(cornerTile,rowTile, 32, + 1, 0, tau, 2); + printMatrix(cornerTile, 1, 1, 32); + + printf("\n\n\n\n\n Row Tile \n\n\n\n\n"); + printMatrix(rowTile, 1, 1, 32); + free(cornerTile); + free(rowTile); } +void test_SGEQRF() +{ + double* cornerTile; + double tau[1024]; + cornerTile = generateColumnMatrix(32, 35532); + DGEQRF( cornerTile, 32, + tau, 0, 2); + printMatrix(cornerTile, 1, 1, 32); + free(cornerTile); +} +void runTests() +{ + // test_SGEQRF(); +// test_SLARFT(); +// test_STSQRF(); + test_SSSRFT(); +} + + /** * @brief Main function. */ - -int main ( int argc , char *argv[] ) { - int c, nr_threads; - int M = 4, N = 4, runs = 1, K = 32; - - /* Get the number of threads. */ - #pragma omp parallel shared(nr_threads) - { - if ( omp_get_thread_num() == 0 ) - nr_threads = omp_get_num_threads(); +int main(int argc, char* argv[]) { + + int c, nr_threads; + int M = 4, N = 4, runs = 1, K = 32; + +/* Get the number of threads. */ +#pragma omp parallel shared(nr_threads) + { + if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads(); + } + + /* Parse the options */ + while ((c = getopt(argc, argv, "m:n:k:r:t:D:")) != -1) switch (c) { + case 'm': + if (sscanf(optarg, "%d", &M) != 1) error("Error parsing dimension M."); + break; + case 'n': + if (sscanf(optarg, "%d", &N) != 1) error("Error parsing dimension M."); + break; + case 'k': + if (sscanf(optarg, "%d", &K) != 1) error("Error parsing tile size."); + break; + case 'r': + if (sscanf(optarg, "%d", &runs) != 1) + error("Error parsing number of runs."); + break; + case 't': + if (sscanf(optarg, "%d", &nr_threads) != 1) + error("Error parsing number of threads."); + omp_set_num_threads(nr_threads); + break; + case 'D': + runTests(); + exit(EXIT_SUCCESS); + case '?': + fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n", + argv[0]); + fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n" + "matrix using nr_threads threads.\n"); + exit(EXIT_FAILURE); } - - /* Parse the options */ - while ( ( c = getopt( argc , argv , "m:n:k:r:t:" ) ) != -1 ) - switch( c ) { - case 'm': - if ( sscanf( optarg , "%d" , &M ) != 1 ) - error( "Error parsing dimension M." ); - break; - case 'n': - if ( sscanf( optarg , "%d" , &N ) != 1 ) - error( "Error parsing dimension M." ); - break; - case 'k': - if ( sscanf( optarg , "%d" , &K ) != 1 ) - error( "Error parsing tile size." ); - break; - case 'r': - if ( sscanf( optarg , "%d" , &runs ) != 1 ) - error( "Error parsing number of runs." ); - break; - case 't': - if ( sscanf( optarg , "%d" , &nr_threads ) != 1 ) - error( "Error parsing number of threads." ); - omp_set_num_threads( nr_threads ); - break; - case '?': - fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] ); - fprintf( stderr , "Computes the tiled QR decomposition of an MxN tiled\n" - "matrix using nr_threads threads.\n" ); - exit( EXIT_FAILURE ); - } - - /* Dump arguments. */ - message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." , - 32*M , 32*N , nr_threads , runs ); - - test_qr( M , N , K , nr_threads , runs , NULL); - - } - - + + /* Dump arguments. */ + message("Computing the tiled QR decomposition of a %ix%i matrix using %i " + "threads (%i runs).", + 32 * M, 32 * N, nr_threads, runs); + + test_qr(M, N, K, nr_threads, runs, NULL); +} diff --git a/examples/test_qr.cu b/examples/test_qr.cu index 270c4ca35cb12113f0fed475842f8abef68d04b2..95b6ad51ea76ebe51707e5ad37bd33046bf6338b 100644 --- a/examples/test_qr.cu +++ b/examples/test_qr.cu @@ -16,11 +16,11 @@ extern "C"{ enum task_types { task_SGEQRF , task_SLARFT , task_STSQRF , task_SSSRFT} ; -#define TID threadIdx.x +//#define TID threadIdx.x #define numthreads 128 -#define tilesize 32 +//#define tilesize 32 #define cuda_maxtasks 1000000 -#define CO(x,y,ldm) (((y)*ldm) + (x)) +//#define CO(x,y,ldm) (((y)*ldm) + (x)) #define WARPS 4 __device__ float *GPU_matrix; @@ -28,6 +28,41 @@ __device__ float *GPU_tau; __device__ int cuda_m; __device__ int cuda_n; +void printMatrix(float *matrix, int m, int n, int tilesize) +{ + int i, j; + + for(i=0; i < m*tilesize; i++) + { + for(j = 0; j < n*tilesize; j++) + { + printf(" %.3f ", matrix[j*m*tilesize +i]); + + } + printf("\n"); + } + +} + +/* Generates a random matrix. */ +float* generateColumnMatrix(int size, unsigned long int m_z) +{ + float* matrix; + cudaMallocHost(&matrix, sizeof(float)*size*size); + if(matrix == NULL) + error("Failed to allocate matrix"); + + int i; + for(i = 0 ; i < size*size; i++) + { + m_z = (1664525*m_z + 1013904223) % 4294967296; + matrix[i] = m_z % 100; + if(matrix[i] < 0) + matrix[i] += 100; + } + return matrix; +} + /* * @brief Performs a reduction sum on a vector, storing the result in v[0]. @@ -47,7 +82,7 @@ __device__ void reduceSum (volatile float* sumVector, while(n > 0) { /* Use first n threads to add up to 2n elements. */ - if(TID < n) sumVector[TID] = sumVector[TID] + sumVector[TID + n]; + if(threadIdx.x < n) sumVector[threadIdx.x] = sumVector[threadIdx.x] + sumVector[threadIdx.x + n]; /* Divide n by 2 for next iteration. */ n = n >> 1; @@ -65,8 +100,8 @@ __device__ inline void reduceSumMultiWarp( float* value ) #else __shared__ float stuff[32*WARPS]; - int tid = TID%32; - int group = TID / 32; + int tid = threadIdx.x%32; + int group = threadIdx.x / 32; char n = 32 >> 1; while(n > 0 ) { @@ -81,7 +116,6 @@ __device__ inline void reduceSumMultiWarp( float* value ) } - /* Routines for the tiled QR decomposition.*/ /** @@ -96,42 +130,53 @@ __device__ inline void reduceSumMultiWarp( float* value ) * * Note: 32 threads max. */ -__device__ void SGEQRF(float* restrict cornerTile, int tileSize, float* restrict tauMatrix, int k, int tauNum, float* w) +__device__ void SGEQRF(volatile float* cornerTile, int tilesize,volatile float* tauMatrix, int k, int tauNum) { int i, j; float norm=0.0, sign, u1, tau, z; - int TID = threadID % 32; - int set = threadID / 32; - + int TID = threadIdx.x % 32; + //int set = threadIdx.x / 32; + float w; /*Find the householder vector for each row. */ - for(i = 0; i < tileSize; i++) + for(i = 0; i < tilesize; i++) { norm = 0.0; - if(TID > = i ) - w[ TID ] = cornerTile[ i*tilesize + TID ] * cornerTile[i * tilesize + TID]; + if(TID >= i ) + norm = cornerTile[ i*tilesize + TID ] * cornerTile[i * tilesize + TID]; else - w[ TID ] = 0.0; - reduceSumMultiWarp( &w[TID] ); - + norm = 0.0; + reduceSumMultiWarp( &norm ); + if(cornerTile[i * tilesize + i] >= 0.0) sign = -1; else sign = 1; - norm = sqrt(w[0]); - if(TID == 0) - w[0] = 0.0; + // sign = __shfl(sign, i, 32); + norm = sqrt(norm); + if(TID >= i ) + w = cornerTile[ i*tilesize + TID ]; + else + w = 0.0; + + + +// if(TID==i) + u1 = cornerTile[i*tilesize + i] - sign*norm; - u1 = cornerTile[i*tilesize + i] - sign*norm; + // __syncthreads(); +// u1 = __shfl(u1, i, 32); + // if(i==1) +// printf("%.3f\n", u1); if(u1 != 0.0) { if(TID > i) - w[ TID ] = w[TID] / u1; + w = w / u1; } else { if(TID > i) - w[ TID ] = 0.0; + w = 0.0; } if(norm != 0.0) @@ -139,29 +184,33 @@ __device__ void SGEQRF(float* restrict cornerTile, int tileSize, float* restrict else tau = 0.0; + //if(TID == 0) + // printf("%.3f\n", sign); + /*Store the below diagonal vector */ if(TID > i) - cornerTile[ i*tileSize + TID ] = w[TID]; - if(TID == 0) - cornerTile[ i*tileSize + i ] = sign*norm; + cornerTile[ i*tilesize + TID ] = w; + if(TID == i) + cornerTile[ i*tilesize + i ] = sign*norm; if(TID == i) - w[i] = 1.0; + w = 1.0; /* Apply the householder transformation to the rest of the tile, for everything to the right of the diagonal. */ - for(j = i+1; j < tileSize; j++) + for(j = i+1; j < tilesize; j++) { /*Compute w'*A_j*/ if(TID >= i) - z = cornerTile[j*tileSize + TID] * w[TID]; + z = cornerTile[j*tilesize + TID] * w; else z = 0; reduceSumMultiWarp(&z); /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */ if(TID >= i) - cornerTile[j*tileSize + TID] = cornerTile[j*tileSize + TID] - tau*w[TID]*z; + cornerTile[j*tilesize + TID] = cornerTile[j*tilesize + TID] - tau*w*z; } /* Store tau. We're on k*tileSize+ith row. kth column.*/ - if(TID == 0) - tauMatrix[(k*tileSize+i)*tauNum+k] = tau; + if(TID == 0){ + tauMatrix[(k*tilesize+i)*tauNum+k] = tau; + } } @@ -182,38 +231,43 @@ __device__ void SGEQRF(float* restrict cornerTile, int tileSize, float* restrict * * */ -void SLARFT(float* cornerTile, float* rowTile, int tileSize, int jj, int kk, float* tauMatrix, int tauNum, float* w) +void __device__ SLARFT(volatile float* cornerTile, volatile float* rowTile, int tilesize, int jj, int kk, volatile float* tauMatrix, int tauNum) { - int i, j, n; + int i, j; float z; float w; - int TID = threadID % 32; - int set = threadID / 32; + int TID = threadIdx.x % 32; + int set = threadIdx.x / 32; /* For each row in the corner Tile*/ - for(i = 0; i < tileSize; i++) + for(i = 0; i < tilesize; i++) { /*Get w for row i */ - for(j = i; j < tileSize; j++) - { - w[j] = cornerTile[i*tileSize + j]; - } - w[i] = 1.0; + if(TID > i) + w = cornerTile[i*tilesize + TID]; + if(TID == i) + w = 1.0; + if(TID < i) + w = 0.0; /* Apply to the row Tile */ - for(j = 0; j < tileSize; j++) + for(j = set; j < tilesize; j+=(blockDim.x/32)) { z=0.0; /* Below Diagonal!*/ /*Compute w'*A_j*/ - for(n = i; n < tileSize; n++) - { - z = z + w[n] * rowTile[j*tileSize+n]; - } - for(n = i; n < tileSize; n++) - { - rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z; - } + z = w * rowTile[j*tilesize+TID]; + //for(n = i; n < tileSize; n++) + //{ + // z = z + w[n] * rowTile[j*tileSize+n]; + //} + reduceSumMultiWarp(&z); + if(TID >= i) + rowTile[j*tilesize + TID] = rowTile[j*tilesize + TID] - tauMatrix[(kk*tilesize+i)*tauNum + kk] * w * z; + //for(n = i; n < tileSize; n++) + //{ + // rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z; + //} } @@ -222,526 +276,174 @@ void SLARFT(float* cornerTile, float* rowTile, int tileSize, int jj, int kk, flo } - -/* - * @brief Computes a householder vector for a single tile QR decomposition, storing the result in a shared array. - * @param matelem The TIDth element of the column to be used for calculation. - * @param hhVector SMEM The array used to calculate and store the vector. - * @param k The timestep at which this is called. i.e where the diagonal falls. - * - * Calculates a householder vector for the vector passed as elements into a shared array. - * Used only by SGEQRF_CUDA. - */ -__device__ void calcHH (float matelem, - volatile float hhVector[], - int k) -{ - /* To store the value of 1/(v[k] + sign(v[k])*||v||). */ - float localdiv; - /* Stores the sign of v[k] */ - int sign; - - /* Read vectors into shared memory from below the diagonal. */ - if(TID >= k) - hhVector[TID] = matelem; - - /* Above the diagonal, set to 0. */ - if(TID < k) - hhVector[TID] = 0; - - /* Square each element to find ||v|| = sqrt(sum(v[i]^2)) */ - hhVector[TID] *= hhVector[TID]; - - /* Reduce to find sum of squares. */ - reduceSum(hhVector, 32); - - /* Set localdiv to equal ||v|| */ - localdiv = sqrt(hhVector[0]); - - /* According to Householder algorithm in Golub & Van Loan. */ - if(localdiv != 0.0) - { - /* Load shared to communicate v[k] to all threads. */ - hhVector[TID] = matelem; - - /* Calculate the sign of v[k] locally. */ - sign = hhVector[k] >= 0 ? 1 : -1; - - /* Compute and store localdiv = (||v||)*sign(v[k]) */ - localdiv *= sign; - - /* Compute and store localdiv = v[k] + (||v||*sign(v[k])) */ - localdiv += hhVector[k]; - - /* Finally compute and store localdiv = 1/(v[k] + ||v||*sign(v[k])) */ - localdiv = 1.0f/localdiv; - } - /* If norm is 0, do not change v. */ - else - localdiv = 1.0f; - - /* Compute and store in shared memory v = v/(v[k] + ||v||*sign(v[k])) */ - if(TID < k) - hhVector[TID] = 0.0f; - /* v[k]/v[k] = 1 */ - if(TID == k) - hhVector[TID] = 1.0f; - if(TID > k) - hhVector[TID] = matelem * localdiv; -} - -/* - * @brief Applies a householder vector to the submatrix starting at (k,k), also inserts the kth tau value into tau[k]. - * @param blockCache SMEM The 1024-element array storing the full tile to which the vector will be applied. - * @param matTau The tau vector. - * @param workingVector SMEM The array used in the computation as a scratchpad. - * @param k Defines the submatrix. - * @param hhVectorelem Element of the cumulatively-stored householder vector to apply. +/** + * + * @brief Applies the householder factorisation of the corner to the row tile. + * + * @param cornerTile The corner tile for this value of k. + * @param columnTile The tile for which householders are computed. + * @param tilesize The number of elements on a row/column of the tile. + * @param tauMatrix A pointer to the tau Matrix. + * @param ii The value of i for the QR decomposition. + * @param kk The value of k for the QR decomposition. + * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). + * * - * Works by first calculating the kth tau, - * then proceeding to apply the vector to successive columns of the matrix A_j - * with the formula A_j = A_j - tau[k]*v*(v'A_j). - * The kth tau is then stored into memory. */ -__device__ void applyHH(volatile float blockCache[], - float* matTau, - volatile float workingVector[], - int k, - float hhVectorelem) +__device__ void STSQRF(volatile float* cornerTile,volatile float* columnTile, int tilesize, int ii, int kk, volatile float* tauMatrix, int tauNum ) { - float ktau, - /* Stores the value of ktau*v*v'*A_j */ - z; - - int j; - float reg; - /* Read householder vector into shared memory to calculate ktau = 2/v'v */ - reg/*workingVector[TID]*/ = hhVectorelem; - /* Square the elements to start finding v'v */ - reg/*workingVector[TID]*/ *= reg/*workingVector[TID]*/; - - /* Perform a reduction to compute v'v and store the result in v[0]. */ - //reduceSum(workingVector, 32); - reduceSumMultiWarp(®); - - /* Finally compute ktau = 2/v'v */ - ktau = 2.0 / /*workingVector[0]*/reg; - - /* For the columns of the submatrix starting at (k,k). */ - for(j = k; j < 32; j ++) - { - /* Fill the shared memory to calculate v'*A_j */ - reg/*workingVector[TID]*/ = blockCache[TID+(j*32)] * hhVectorelem; - - /* Perform reduction to compute v'*A_j, storing the result in v[0] */ - //reduceSum(workingVector, 32); - reduceSumMultiWarp(®); - - /* Compute and store z = ktau*v[tid] */ - z = hhVectorelem * ktau; - - /* Compute and store z = (v'A_j)*(ktau*v[tid]) */ - z *= /*workingVector[0]*/reg; - - /* Apply with A_j[tid] = A_j[tid] - v'A_j*ktau*v[tid] */ - blockCache[TID + (j*32)] -= z; - } - - /* Store the essential (below diagonal) portion of householder vector below the diagonal in the block at column k. */ - if(TID > k) - blockCache[TID + k*32] = hhVectorelem; - - /* Store the kth tau. */ - if(TID == 0) - matTau[k] = ktau; -} - - - - - - -__device__ void runner_cuda_SGEQRF( int i, int j, int k, volatile float workingVector[], volatile float blockCache[] ) -{ - int kk, jj, refCache; - refCache = TID; - float* matrix = &GPU_matrix[j*cuda_m*32*32 + i*32*32]; - float* matTau = &GPU_tau[j*cuda_m*32 + i*32 ]; - for(jj = 0; jj < 32; jj ++)//load block into shared memory - { - blockCache[refCache] = matrix[refCache]; - refCache += 32; - } - - for(kk = 0; kk < 32; kk ++) - { - //calculate the kth hh vector from the kth column of the TIDth row of the matrix - calcHH (blockCache[TID + (kk*32)],//(tid, k) - workingVector, - kk); - - //calculate the application of the hhvector along row TID - applyHH (blockCache, - matTau, - workingVector, - kk, - workingVector[TID]); - } + int i, j, n; + float norm=0.0, sign, u1, tau, z; + float w, wupper; + int TID = threadIdx.x % 32; + //int set = threadIdx.x / 32; - //copy row back - refCache = TID; + /* For each column compute the householder vector. */ + for(i = 0; i < tilesize; i++) + { +// norm = 0.0; + + wupper = cornerTile[i*tilesize + i]; - for(jj = 0; jj < 32; jj ++)//unload block from shared memory - { - matrix[refCache] = blockCache[refCache]; - refCache += 32; - } - __threadfence(); + w = columnTile[i*tilesize + TID]; + norm = (w*w); + + reduceSumMultiWarp(&norm); + norm += wupper*wupper; + norm = sqrt(norm); + if(wupper >= 0.0) + sign = -1; + else + sign = 1; + u1 = wupper - sign*norm; + if(u1 != 0.0) + { + w = w / u1; + } + else + { + w = 0.0; + } -} + if(norm != 0.0) + tau = -sign*u1/norm; + else + tau = 0.0; -__device__ void runner_cuda_SLARFT (int i, int j, int k, volatile float tau[], volatile float blockCache[] ) -{ - int jj, kk, ii, - tid = TID%32, - group = TID/32; - - __shared__ volatile float groupCache[32*4]; - float* blockV = &GPU_matrix[k*cuda_m*32*32 + k*32*32]; - float* blockA = &GPU_matrix[j*cuda_m*32*32 + i*32*32]; - float* blockTau = &GPU_tau[ i*cuda_m*32 + i*32 ]; - // cacheCol is a column in the groupCache - volatile float *cacheCol = groupCache + (group*32); - - float alpha, - belem; - - __syncthreads(); - - /* Load tau Vector */ - if(TID < 32) - tau[TID] = blockTau[TID]; - - /* Load Vectors */ - for(jj = group; jj < 32; jj += WARPS) - { - if(tid < jj) - blockCache[tid + (jj*32)] = 0.0f; - else if(tid == jj) - blockCache[tid + (jj*32)] = 1.0f; - else if(tid > jj) - blockCache[tid + (jj*32)] = blockV[tid + (jj*32)]; - //__syncthreads(); - } - __syncthreads(); + for(j = i; j < tilesize; j++) + { + z = w * columnTile[j*tilesize+TID]; + reduceSumMultiWarp(&z); + z += cornerTile[j*tilesize+i]; + if(threadIdx.x == 0) + cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i] - tau*z; - /* Compute b_j -= tau*v*v'b_j, for all vectors in blockCached V */ - for(jj = group; jj < 32; jj += WARPS) - { - /* Read the column into registers. */ - //if(group == 0) - //{ - belem = blockA[tid + (jj*32)]; - //__syncthreads(); - if(jj == 0) - printf("%.3f %i\n", belem, TID); - /* For each vector in block of vectors. */ - - for(kk = 0; kk < 32; kk++) - { - /* Compute alpha = v'*b_j */ - cacheCol[tid] = blockCache[tid + (kk*32)] * belem; - if(group == 0 && jj == 0 && kk == 0) - printf("%.3f %i %.3f %.3f\n", belem, TID , blockCache[tid+kk*32],cacheCol[tid]); - // __syncthreads(); - - if(tid < 16) - cacheCol[tid] += cacheCol[tid+16]; - // __syncthreads(); - if(tid < 8) - cacheCol[tid] += cacheCol[tid+8]; - // __syncthreads(); - - alpha = cacheCol[0]; - for(ii = 1; ii < 8; ii ++) - alpha += cacheCol[ii]; - - /* Compute alpha = tau * v_tid * alpha */ - alpha *= tau[kk]; - if(TID==0 && kk == 0) - printf("tau[kk] = %.3f\n", tau[kk]); - alpha *= blockCache[tid + (kk*32)]; - if(jj == 0 && kk == 0) - printf("%.3f\n", alpha); - /* Compute belem -= alpha */ - belem -= alpha; - //__syncthreads(); - } - blockA[tid + (jj*32)] = belem;//} - //__syncthreads(); - } - __threadfence(); + columnTile[j*tilesize+TID] = columnTile[j*tilesize+TID] - tau*w*z; + } + + columnTile[i*tilesize+TID] = w; + if(threadIdx.x == 0) + tauMatrix[(kk*tilesize+i)*tauNum+ii] = tau; + /* Apply to each row to the right.*/ +// for(j = i; j < tilesize; j++) + // { + /* Find w'*A_j, w is 0s except for first value with upper tile.*/ + // z = 1.0 * cornerTile[ j*tilesize+i ]; + // for(n = 0; n < tilesize; n++) + // { + // z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ]; +// } + /* Apply to upper tile.*/ + // cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z; + // for(n = i+1; n < tilesize; n++) + // { + // cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z; + // } + /* Apply to lower tile.*/ +// for(n = 0; n < tilesize; n++) + // { + // columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z; + // } + + // } + /* Store w*/ +// for(j = 0; j < tilesize; j++){ + // columnTile[ i*tilesize+j ] = w[tilesize+j]; + // } + // tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau; + } } -__device__ void applyOneHHVectD (float* topElem, - float* lowElem, - int k, - volatile float tau[], - volatile float blockCache[]) -{ - float alpha; - __shared__ volatile float workV[32]; - float reg; - int tid = TID % 32; - - /* Compute alpha = sum */ - if(tid == k) - reg/*workV[TID]*/ = *topElem; - if(tid != k) - reg/*workV[TID]*/ = 0.0f; - - reg/*workV[TID]*/ += *lowElem * blockCache[tid + (k*32)]; - - //reduceSum(workV, 32); - reduceSumMultiWarp(®); - - alpha = reg/*workV[0]*/; - - /* Multiply by tau */ - alpha *= tau[k]; - - /* Compute alpha *= a_tid,j*v_tid,k */ - if(tid == k) - *topElem -= alpha; - - /* For lower element. */ - alpha *= blockCache[tid + (k*32)]; - *lowElem -= alpha; -} -__device__ void calcDoubleHHWY (float topElem, - float lowElem, - int k, - volatile float blockCache[]) +/** + * + * @brief Applies the householder factorisation of the corner to the row tile. + * + * @param cornerTile A pointer to the tile to have the householder applied. + * @param columnTile The tile in which the householders are stored. + * @param rowTile The upper tile to have the householders applied. + * @param tilesize The number of elements on a row/column of the tile. + * @param tauMatrix A pointer to the tau Matrix. + * @param ii The value of i for the QR decomposition. + * @param jj The value of j for the QR decomposition. + * @param kk The value of k for the QR decomposition. + * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column). + * + * + */ +__device__ void SSSRFT( volatile float* cornerTile,volatile float* columnTile,volatile float* rowTile, int tilesize, int ii, int jj, int kk,volatile float* tauMatrix, int tauNum ) { -/* Calculate v_k. */ - int sign, i; - - float topSum = 0, - lowSum = 0, - alpha; - - __shared__ volatile float first; - - if(TID == k) - first = topElem; - - topSum = first * first; - /* Sum squares of V to compute norm, using column of BlockCache as working space. */ - blockCache[TID + (k*32)] = lowElem * lowElem; - for(i = 0; i < 32; i ++) - lowSum += blockCache[i + (k*32)]; - - alpha = sqrt(topSum + lowSum); - sign = first >= 0.f ? 1.f : -1.f; - - if(alpha != 0.0f) - { - /* Zeroth element is = first + sign*norm */ - alpha = first + sign * alpha; - alpha = 1.0f/alpha; - } - else - alpha = 1.0f; + int i, j, n; + float z; + float w; + int TID = threadIdx.x % 32; + int set = threadIdx.x / 32; + float tau; - //topElem *= alpha; - lowElem *= alpha; - - blockCache[TID + (k*32)] = lowElem; -} + + for(i = 0; i < tilesize; i++) + { + + w = columnTile[i*tilesize+TID]; + tau = tauMatrix[(kk*tilesize+i)*tauNum+ii]; -__device__ void runner_cuda_STSQRF ( int i, int j, int k, volatile float tauVect[], volatile float blockCache[]) -{ - /* Idea: for each column j of (AB)^T, - apply HH vectors 0...j-1, - compute new HH vector for column j, - store essential part in blockCache - - Uses one block cache, one vector storage. */ - float* blockA = &GPU_matrix[k*cuda_m*32*32 + k*32*32]; - float* blockB = &GPU_matrix[j*cuda_m*32*32 + i*32*32]; - float* blockTau = &GPU_tau[j*cuda_m*32 + i*32]; -// int j, k, i; - int tid = TID % 32; - int group = TID/32; - float topElem, - lowElem, - tau; - - for(j = 0; j < 32; j ++) - { - if(TID < 32 ) + for(j = set; j < tilesize; j+=blockDim.x/32) { - if(TID <= j) - topElem = blockA[TID + (j*32)]; - if(TID > j) - topElem = 0.f; - - lowElem = blockB[TID + (j*32)]; - - //for(k = 0; k < j; k ++) - //{ - /* Compute b_tid,j = b_tid,j - tau*vv'*b_tid,j */ - // applyOneHHVectD (&topElem, &lowElem, - // k, - // tauVect, - // blockCache); - //} - - calcDoubleHHWY (topElem, - lowElem, - j, - blockCache); - - /* Compute new tau = 2/v'v */ - tau = 1.0f; - for(i = 0; i < 32; i ++) - tau += blockCache[i + (j*32)] * blockCache[i + (j*32)]; - tau = 2.0f/tau; - - if(TID == j) - tauVect[j] = tau; + z = w * cornerTile[j*tilesize+TID]; + reduceSumMultiWarp(&z); + z += rowTile[j*tilesize+i]; + if(TID == 0) + rowTile[j*tilesize+i] = rowTile[j*tilesize+i] - tau*z; + cornerTile[j*tilesize+TID] = cornerTile[j*tilesize+TID] - tau*w*z; } - __syncthreads(); - /* Apply new vector to column. */ - for(k = j+group ; k < 32; k +=WARPS ) + /*for(j = 0; j < i; j++) + w[j] = 0.0; + w[i] = 1.0; + for(j = i+1; j < tilesize; j++) + w[j] = 0.0; + for(j = 0; j < tilesize; j++) + w[j+tilesize] = columnTile[i*tilesize +j];*/ + + /* Apply householder vector (w) to the tiles.*/ +/* for(j = 0; j < tilesize; j++) { - if(k != j) + z = 0.0;*/ + /* Compute w' * A_j */ +/* for(n = 0; n < tilesize; n++) { - if(tid <= k ) - topElem = blockA[tid + k*32]; - if( tid > k ) - topElem = 0.f; - - lowElem = blockB[tid + k*32]; + z += w[n] * rowTile[j*tilesize+n]; + z += w[n + tilesize] * cornerTile[j*tilesize+n]; } - applyOneHHVectD (&topElem, &lowElem, - j, - tauVect, - blockCache); - - if(tid <= k) - blockA[tid + k*32] = topElem; - - blockB[tid + k*32] = lowElem; - - // __syncthreads(); - } - - __syncthreads(); - /* Write back */ - //if(TID <= j) - // blockA[TID + (j*ldm)] = topElem; - } - - /* Write back lower block, containing householder Vectors. */ - if(TID < 32) - { - for(j = 0; j < 32; j ++) - blockB[TID + (j*32)] = blockCache[TID + (j*32)]; - - blockTau[TID] = tauVect[TID]; + for(n = 0; n < tilesize; n++) + { + rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z; + cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z; + } + }*/ } - __threadfence(); -} - -__device__ void runner_cuda_SSSRFT( int i, int j, int k, volatile float tau[], volatile float blockCache[] ) -{ - __shared__ volatile float workV[WARPS*32]; - __syncthreads(); - - float aelem, belem, - alpha; - float* blockV = &GPU_matrix[k*cuda_m*32*32 + i*32*32]; - float* blockA = &GPU_matrix[j*cuda_m*32*32 + k*32*32]; - float* blockB = &GPU_matrix[j*cuda_m*32*32 + i*32*32]; - float* blockTau = &GPU_tau[k*cuda_m*32 + i*32]; - - int tid, group, - refMat, refCache; - - tid = TID%32; - group = TID/32; - - refMat = tid + group*32; - refCache = tid + group*32; - - if(TID < 32) - tau[TID] = blockTau[TID]; - __syncthreads(); - - - /* Load the essential HH vector block into shared cache. */ - for(j = group; j < 32; j += WARPS) - { - blockCache[refCache] = blockV[refMat]; - - refMat += WARPS*32; - refCache += WARPS*32; - __syncthreads(); - } - - __syncthreads(); - - /* For each column of the result. */ - for(j = group; j < 32; j += WARPS) - { - //if(tid == 0) printf("%d: column %d.\n", group, j); - /* Load the elements of the column to process. */ - aelem = blockA[tid + (j*32)]; - belem = blockB[tid + (j*32)]; - - /* Compute and apply b_j = b_j - tau*vv'b_j - for each Householder vector 1..32. */ - for(k = 0; k < 32; k ++) - { - /* Load the kth element into all threads. */ - workV[tid + (group*32)] = aelem; - - /* Store this as alpha. */ - alpha = workV[k + (group*32)]; - - /* Load components of v'b_j to sum. */ - workV[tid + (group*32)] = belem * blockCache[tid + (k*32)]; - - if(tid < 16) - workV[tid + (group*32)] += workV[16 + tid + (group*32)]; - if(tid < 8) - workV[tid + (group*32)] += workV[8 + tid + (group*32)]; - /* Compute v'b_j */ - for(i = 0; i < 8; i ++) - alpha += workV[i + (group*32)]; - - /* Multiply by kth tau. */ - alpha *= tau[k]; - - /* Compute b_j -= alpha*v - If kth thread in group, v is 1 at aelem. */ - if(tid == k) aelem -= alpha; - - /* Compute b_j -= alpha * v for lower half. */ - belem -= alpha * blockCache[tid + (k*32)]; - __syncthreads(); - } - /* put the elements back. */ - blockA[tid + (j*32)] = aelem; - blockB[tid + (j*32)] = belem; - //__syncthreads(); - } - __syncthreads(); } @@ -759,7 +461,7 @@ __device__ void runner ( int type , void *data ) { switch ( type ) { case task_SGEQRF: if(threadIdx.x < 32) - runner_cuda_SGEQRF(i, j, k, workVector, blockCache); +// runner_cuda_SGEQRF(i, j, k, workVector, blockCache); if(threadIdx.x == 0){ printf("SGEQRF: %i %i %i + %i\n",i, j , k, clock()); @@ -1116,6 +818,188 @@ void test() free(Matrix); } +/* Unit tests for the tiled QR decomposition.*/ + +__device__ volatile float tile[32*32]; +__device__ volatile float tile2[32*32]; +__device__ volatile float tile3[32*32]; +__device__ volatile float tile4[32*32]; +__device__ volatile float tau[1024]; + +__global__ void SGEQRF_test(float* cornerTile) +{ + int i; + + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + { + printf("Copying value %i\n", i); + tile[i] = cornerTile[i]; + } + + printf("blockDim.x = %i\n", blockDim.x); + __syncthreads(); + if(threadIdx.x < 32) + SGEQRF(tile, 32, tau, 0, 1); + __syncthreads(); + __threadfence(); + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + cornerTile[i] = tile[i]; +} + +void test_SGEQRF() +{ + float* cornerTile; + cornerTile = generateColumnMatrix(32, 35532); + SGEQRF_test<<<1, 128>>>(cornerTile); + cudaDeviceSynchronize(); + printMatrix(cornerTile, 1, 1, 32); + cudaFreeHost(cornerTile); +} + + +__global__ void SLARFT_test(float* cornerTile, float* rowTile) +{ + int i; + + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + tile[i] = cornerTile[i]; + + __syncthreads(); + + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + tile2[i] = rowTile[i]; + __syncthreads(); + if(threadIdx.x < 32) + SGEQRF(tile, 32, tau, 0, 2); + + __syncthreads(); +// if(threadIdx.x < 32) + SLARFT(tile, tile2, 32, 1, 0, tau, 2); + __syncthreads(); + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + cornerTile[i] = tile[i]; + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + rowTile[i] = tile2[i]; + __syncthreads(); +} + +void test_SLARFT() +{ + float* cornerTile, *rowTile; + cornerTile = generateColumnMatrix(32, 35536); + rowTile = generateColumnMatrix(32, 35539); + SLARFT_test<<<1, 128>>>(cornerTile, rowTile); + cudaDeviceSynchronize(); + printMatrix(cornerTile, 1, 1, 32); + + printf("\n\n\n\n\n Row Tile \n\n\n\n\n"); + printMatrix(rowTile, 1, 1, 32); + cudaFreeHost(cornerTile); + cudaFreeHost(rowTile); +} + + +__global__ void STSQRF_test(float* cornerTile, float* columnTile) +{ + int i; + + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + tile[i] = cornerTile[i]; + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + tile2[i] = columnTile[i]; + __syncthreads(); + if(threadIdx.x < 32) + SGEQRF(tile, 32, tau, 0, 2); + __syncthreads(); + if(threadIdx.x < 32) + STSQRF( tile, tile2, 32, 1, 0, tau, 2); + __syncthreads(); + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + cornerTile[i] = tile[i]; + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + columnTile[i] = tile2[i]; +} + +void test_STSQRF() +{ + float *cornerTile, *columnTile; + cornerTile = generateColumnMatrix(32, 35531); + columnTile = generateColumnMatrix(32, 35585); + STSQRF_test<<<1,32>>>(cornerTile, columnTile); + cudaDeviceSynchronize(); + printMatrix(cornerTile, 1, 1, 32); + printf("\n\n\n\n\n Column Tile \n\n\n\n\n"); + printMatrix(columnTile, 1, 1, 32); + cudaFreeHost(cornerTile); + cudaFreeHost(columnTile); +} + + + +__global__ void SSSRFT_test(float* cornerTile, float* columnTile, float* rowTile, float* otherTile) +{ + int i; + + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + tile[i] = cornerTile[i]; + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + tile2[i] = columnTile[i]; + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + tile3[i] = rowTile[i]; + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + tile4[i] = otherTile[i]; + __syncthreads(); + if(threadIdx.x < 32) + SGEQRF(tile, 32, tau, 0, 2); + __syncthreads(); + if(threadIdx.x < 32) + STSQRF( tile, tile2, 32, 1, 0, tau, 2); + __syncthreads(); + SLARFT(tile, tile3, 32, 1, 0, tau, 2); + __syncthreads(); + SSSRFT( tile4, tile2, tile3, 32, 1, 1 , 0, tau, 2 ); + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + cornerTile[i] = tile[i]; + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + columnTile[i] = tile2[i]; + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + rowTile[i] = tile3[i]; + for(i = threadIdx.x; i < 32*32; i+=blockDim.x) + otherTile[i] = tile4[i]; +} + +void test_SSSRFT() +{ + float* cornerTile, *rowTile, *columnTile, *otherTile; + cornerTile = generateColumnMatrix(32, 35136); + rowTile = generateColumnMatrix(32, 35239); + columnTile = generateColumnMatrix(32, 35339); + otherTile = generateColumnMatrix(32, 35739); + SSSRFT_test<<<1, 128>>>(cornerTile, columnTile, rowTile, otherTile); + cudaDeviceSynchronize(); + printMatrix(cornerTile, 1, 1, 32); + printf("\n\n\n\n\n Column Tile \n\n\n\n\n"); + printMatrix(columnTile, 1, 1, 32); + printf("\n\n\n\n\n Row Tile \n\n\n\n\n"); + printMatrix(rowTile, 1, 1, 32); + printf("\n\n\n\n\n Other Tile \n\n\n\n\n"); + printMatrix(otherTile, 1, 1, 32); + cudaFreeHost(cornerTile); + cudaFreeHost(rowTile); + cudaFreeHost(columnTile); + cudaFreeHost(otherTile); + +} + + + +void runTests() +{ + // test_SGEQRF(); + // test_SLARFT(); + // test_STSQRF(); + test_SSSRFT(); +} /** * @brief Main function. */ @@ -1157,7 +1041,7 @@ int main ( int argc , char *argv[] ) { //omp_set_num_threads( nr_threads ); break; case 'D': - test(); + runTests(); exit( EXIT_SUCCESS ); case '?': fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] );