diff --git a/examples/Makefile.am b/examples/Makefile.am
index 152221294d79d6d628c5dc5c5776bc9f2e309f28..9708f27ed737f34bc9bb425d4b6bb0529b8feb8a 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -23,23 +23,22 @@ AUTOMAKE_OPTIONS=gnu
AM_CFLAGS = -g -O3 -Wall -Werror -I../src -ffast-math -fstrict-aliasing \
-ftree-vectorize -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \
-DCPU_TPS=2.67e9 -DTIMERS \
- #-fsanitize=address -fno-omit-frame-pointer
+ # -fsanitize=address -fno-omit-frame-pointer
AM_LDFLAGS = -lm # -fsanitize=address
# Set-up the library
-bin_PROGRAMS = test test_bh test_bh_2 test_bh_3 test_bh_sorted test_qr
-#test_qr
+bin_PROGRAMS = test test_qr test_bh test_bh_2 test_bh_3 test_bh_sorted
+
# Sources for test
test_SOURCES = test.c
test_CFLAGS = $(AM_CFLAGS)
test_LDADD = ../src/.libs/libquicksched.a
# Sources for test_qr
-#-L/usr/bin64/atlas/ /home/aidan/lapack-3.5.0/liblapacke.a /home/aidan/lapack-3.5.0/liblapack.a
-test_qr_SOURCES = test_qr.c /usr/lib64/atlas/libcblas.a /usr/lib64/atlas/libptcblas.a
-test_qr_CFLAGS = $(AM_CFLAGS) -I/home/aidan/ATLAS/ATLAS_linux/include #-I/home/aidan/lapack-3.5.0/lapacke/include
-test_qr_LDADD = ../src/.libs/libquicksched.a -lf77blas -lcblas -latlas -lm -L/home/aidan/ATLAS/ATLAS_linux/lib/
+test_qr_SOURCES = test_qr.c
+test_qr_CFLAGS = $(AM_CFLAGS)
+test_qr_LDADD = ../src/.libs/libquicksched.a -llapacke -llapacke -lblas
# Sources for test_bh
test_bh_SOURCES = test_bh.c
diff --git a/examples/test_qr.c b/examples/test_qr.c
index ada33a33acf4833ed478e10bcd0f0cadf466a5e5..b9cdbecde7b6a2d430f0afbbe24015c88ba8a028 100644
--- a/examples/test_qr.c
+++ b/examples/test_qr.c
@@ -1,22 +1,21 @@
/*******************************************************************************
* This file is part of QuickSched.
* Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- *
+ *
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- ******************************************************************************/
-
+ *
+* *****************************************************************************/
/* Config parameters. */
#include "../config.h"
@@ -30,187 +29,161 @@
#include <omp.h>
#include <pthread.h>
-
-
-
#include <cblas.h>
-
/* Local includes. */
#include "quicksched.h"
/**
- * Takes a column major matrix, NOT tile major. size is length of a side of the matrix. Only works for square matrices.
- * This function is simply for validation and is implemented naively as we know of no implementation to retrieve Q from the tiled QR.
+ * Takes a column major matrix, NOT tile major. size is length of a side of the
+ * matrix. Only works for square matrices.
+ * This function is simply for validation and is implemented naively as we know
+ * of no implementation to retrieve Q from the tiled QR.
*/
-double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum)
-{
- double* Q = malloc(sizeof(double)*size*size);
- double* Qtemp = malloc(sizeof(double)*size*size);
- double* w = malloc(sizeof(double)*size);
- double* ww = malloc(sizeof(double)*size*size);
- double* temp = malloc(sizeof(double)*size*size);
- int i, k, l, j, n;
- bzero(Q, sizeof(double)*size*size);
- bzero(Qtemp, sizeof(double)*size*size);
- bzero(ww, sizeof(double)*size*size);
- for(i = 0; i < size; i++)
- {
- Q[i*size + i] = 1.0;
- }
- int numcoltile = size / tilesize;
- int numrowtile = size / tilesize;
- for(k = 0; k < numrowtile; k++)
- {
- for(l = 0; l < tilesize; l++)
- {
- bzero(Qtemp, sizeof(double)*size*size);
- for(i = 0; i < size; i++)
- {
- Qtemp[i*size + i] = 1.0;
- }
-
-
- for(i = k; i < numcoltile; i++)
- {
- bzero(w, sizeof(double)*size);
-
- for(j = 0 ; j < tilesize; j++)
- {
- w[i*tilesize + j] = HR[(k*tilesize+l)*size + i*tilesize+j];
- }
- w[k*tilesize+l] = 1.0;
- if(k*tilesize+l > i*tilesize)
- {
- for(j = 0; j < k*tilesize+l; j++)
- w[j] = 0.0;
- }
-
-
- /* Compute (I - tau*w*w')' */
- for(j = 0; j < size; j++)
- {
- for(n = 0; n < size; n++)
- {
- if(j != n)
- ww[n*size + j] = -tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n];
- else
- ww[n*size + j] = 1.0 - tau[(k*tilesize+l)*tauNum+ i] * w[j] * w[n];
- }
- }
-
- /* Qtemp = Qtemp * (I-tau*w*w')' */
- cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Qtemp, size, ww, size, 0.0, temp, size);
- double *b = Qtemp;
- Qtemp = temp;
- temp = b;
- }
- cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size, 1.0, Q, size, Qtemp, size, 0.0, temp, size);
- double *b = Q;
- Q = temp;
- temp = b;
+double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) {
+ double* Q = malloc(sizeof(double) * size * size);
+ double* Qtemp = malloc(sizeof(double) * size * size);
+ double* w = malloc(sizeof(double) * size);
+ double* ww = malloc(sizeof(double) * size * size);
+ double* temp = malloc(sizeof(double) * size * size);
+ int i, k, l, j, n;
+ bzero(Q, sizeof(double) * size * size);
+ bzero(Qtemp, sizeof(double) * size * size);
+ bzero(ww, sizeof(double) * size * size);
+ for (i = 0; i < size; i++) {
+ Q[i * size + i] = 1.0;
+ }
+ int numcoltile = size / tilesize;
+ int numrowtile = size / tilesize;
+ for (k = 0; k < numrowtile; k++) {
+ for (l = 0; l < tilesize; l++) {
+ bzero(Qtemp, sizeof(double) * size * size);
+ for (i = 0; i < size; i++) {
+ Qtemp[i * size + i] = 1.0;
+ }
+
+ for (i = k; i < numcoltile; i++) {
+ bzero(w, sizeof(double) * size);
+
+ for (j = 0; j < tilesize; j++) {
+ w[i * tilesize + j] =
+ HR[(k * tilesize + l) * size + i * tilesize + j];
+ }
+ w[k * tilesize + l] = 1.0;
+ if (k * tilesize + l > i * tilesize) {
+ for (j = 0; j < k * tilesize + l; j++) w[j] = 0.0;
}
+
+ /* Compute (I - tau*w*w')' */
+ for (j = 0; j < size; j++) {
+ for (n = 0; n < size; n++) {
+ if (j != n)
+ ww[n * size + j] =
+ -tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
+ else
+ ww[n * size + j] =
+ 1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
+ }
+ }
+
+ /* Qtemp = Qtemp * (I-tau*w*w')' */
+ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
+ 1.0, Qtemp, size, ww, size, 0.0, temp, size);
+ double* b = Qtemp;
+ Qtemp = temp;
+ temp = b;
+ }
+ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
+ 1.0, Q, size, Qtemp, size, 0.0, temp, size);
+ double* b = Q;
+ Q = temp;
+ temp = b;
}
-
- free(Qtemp);
- free(w);
- free(ww);
- free(temp);
- return Q;
+ }
+
+ free(Qtemp);
+ free(w);
+ free(ww);
+ free(temp);
+ return Q;
}
-double* getR(double*HR, int size)
-{
- double* R = malloc(sizeof(double) * size * size);
- int i, j;
- bzero(R, sizeof(double) * size * size);
- for(i = 0; i< size; i++)
- {
- for(j = 0; j <= i; j++)
- {
- R[i*size + j] = HR[i*size + j];
- }
+double* getR(double* HR, int size) {
+ double* R = malloc(sizeof(double) * size * size);
+ int i, j;
+ bzero(R, sizeof(double) * size * size);
+ for (i = 0; i < size; i++) {
+ for (j = 0; j <= i; j++) {
+ R[i * size + j] = HR[i * size + j];
}
- return R;
+ }
+ return R;
}
-void printMatrix(double* Matrix, int m, int n, int tilesize)
-{
- int i, j;
-
- for(i=0; i < m*tilesize; i++)
- {
- for(j = 0; j < n*tilesize; j++)
- {
- printf(" %.3f ", Matrix[j*m*tilesize +i]);
+void printMatrix(double* Matrix, int m, int n, int tilesize) {
+ int i, j;
- }
- printf("\n");
+ for (i = 0; i < m * tilesize; i++) {
+ for (j = 0; j < n * tilesize; j++) {
+ printf(" %.3f ", Matrix[j * m * tilesize + i]);
}
-
+ printf("\n");
+ }
}
-double* columnToTile( double* columnMatrix, int size , int m , int n , int tilesize)
-{
- double* TileMatrix;
- TileMatrix = malloc(sizeof(double) * size );
- if(TileMatrix == NULL)
- error("failed to allocate TileMatrix");
- int i,j,k,l;
-
- for( i = 0; i < n ; i++ )
- {
- for( j = 0; j < m; j++ )
- {
- double *tileStart = &columnMatrix[i*m*tilesize*tilesize + j*tilesize];
- double *tilePos = &TileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize];
- for(k = 0; k < tilesize; k++)
- {
- tileStart = &columnMatrix[i*m*tilesize*tilesize+k*m*tilesize + j*tilesize];
- for( l = 0; l < tilesize; l++ )
- {
- tilePos[k*tilesize + l] = tileStart[l];
- }
- }
+double* columnToTile(double* columnMatrix, int size, int m, int n,
+ int tilesize) {
+ double* TileMatrix;
+ TileMatrix = malloc(sizeof(double) * size);
+ if (TileMatrix == NULL) error("failed to allocate TileMatrix");
+ int i, j, k, l;
+
+ for (i = 0; i < n; i++) {
+ for (j = 0; j < m; j++) {
+ double* tileStart =
+ &columnMatrix[i * m * tilesize * tilesize + j * tilesize];
+ double* tilePos =
+ &TileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+ for (k = 0; k < tilesize; k++) {
+ tileStart = &columnMatrix[i * m * tilesize * tilesize +
+ k * m * tilesize + j * tilesize];
+ for (l = 0; l < tilesize; l++) {
+ tilePos[k * tilesize + l] = tileStart[l];
}
+ }
}
+ }
- return TileMatrix;
-
+ return TileMatrix;
}
-double* tileToColumn( double* tileMatrix, int size, int m , int n , int tilesize)
-{
- double* ColumnMatrix;
- ColumnMatrix = (double*) malloc(sizeof(double) * size );
- if(ColumnMatrix == NULL)
- error("failed to allocate ColumnMatrix");
- int i,j,k,l;
- for( i = 0; i < n ; i++ )
- {
- for(j = 0; j < m ; j++ )
- {
- /* Tile on ith column is at i*m*32*32.*/
- /* Tile on jth is at j*32*32 */
- double *tile = &tileMatrix[i*m*tilesize*tilesize + j*tilesize*tilesize];
- /* Column starts at same position as tile. */
- /* Row j*32.*/
- double *tilePos = &ColumnMatrix[i*m*tilesize*tilesize + j*tilesize];
- for( k = 0; k < tilesize; k++ )
- {
- for(l=0; l < tilesize; l++)
- {
- tilePos[l] = tile[l];
- }
- /* Next 32 elements are the position of the tile in the next column.*/
- tile = &tile[tilesize];
- /* Move to the j*32th position in the next column. */
- tilePos = &tilePos[tilesize*m];
-
- }
- }
+double* tileToColumn(double* tileMatrix, int size, int m, int n, int tilesize) {
+ double* ColumnMatrix;
+ ColumnMatrix = (double*)malloc(sizeof(double) * size);
+ if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix");
+ int i, j, k, l;
+ for (i = 0; i < n; i++) {
+ for (j = 0; j < m; j++) {
+ /* Tile on ith column is at i*m*32*32.*/
+ /* Tile on jth is at j*32*32 */
+ double* tile =
+ &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+ /* Column starts at same position as tile. */
+ /* Row j*32.*/
+ double* tilePos =
+ &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize];
+ for (k = 0; k < tilesize; k++) {
+ for (l = 0; l < tilesize; l++) {
+ tilePos[l] = tile[l];
+ }
+ /* Next 32 elements are the position of the tile in the next column.*/
+ tile = &tile[tilesize];
+ /* Move to the j*32th position in the next column. */
+ tilePos = &tilePos[tilesize * m];
+ }
}
- return ColumnMatrix;
+ }
+ return ColumnMatrix;
}
/* Routines for the tiled QR decomposition.*/
@@ -219,89 +192,78 @@ double* tileToColumn( double* tileMatrix, int size, int m , int n , int tilesize
*
* @brief Computes the QR decomposition of a tile.
*
- * @param cornerTile A pointer to the tile for which the decomposition is computed.
+ * @param cornerTile A pointer to the tile for which the decomposition is
+* computed.
* @param tilesize The number of elements on a row/column of the tile.
* @param tauMatrix A pointer to the tau Matrix.
* @param k The value of k for the QR decomposition
- * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ * @param tauNum The number of tau values stored for each row of the matrix
+* (this is equal to the number of tiles on each column).
*
*
*/
-void DGEQRF(double* restrict cornerTile, int tileSize, double* restrict tauMatrix, int k, int tauNum, double* w)
-{
- int i, j, n;
- double norm=0.0, sign, u1, tau, z;
-
-
- /*Find the householder vector for each row. */
- for(i = 0; i < tileSize; i++)
- {
- norm = 0.0;
- /*Fill w with the vector.*/
- for(j=i; j < tileSize; j++)
- {
- /* ith row is i*tileSize, only want elements on diagonal or below.*/
- w[ j ] = cornerTile[ i*tileSize +j ];
- /*Find the norm as well */
- norm = norm + w[j]*w[j];
- }
- if(w[i] >= 0.0)
- sign = -1;
- else
- sign = 1;
+void DGEQRF(double* restrict cornerTile, int tileSize,
+ double* restrict tauMatrix, int k, int tauNum) {
+ int i, j, n;
+ double norm = 0.0, sign, u1, tau, z;
+ double w[tileSize];
+
+ /*Find the householder vector for each row. */
+ for (i = 0; i < tileSize; i++) {
+ norm = 0.0;
+ /*Fill w with the vector.*/
+ for (j = i; j < tileSize; j++) {
+ /* ith row is i*tileSize, only want elements on diagonal or below.*/
+ w[j] = cornerTile[i * tileSize + j];
+ /*Find the norm as well */
+ norm = norm + w[j] * w[j];
+ }
+ if (w[i] >= 0.0)
+ sign = -1;
+ else
+ sign = 1;
- norm = sqrt(norm);
+ norm = sqrt(norm);
- u1 = w[i] - sign*norm;
+ u1 = w[i] - sign * norm;
- if(u1 != 0.0)
- {
- for(j=i+1; j < tileSize; j++)
- w[j] = w[j] / u1;
- }
- else
- {
- for(j=i+1; j < tileSize; j++)
- w[j] = 0.0;
- }
+ if (u1 != 0.0) {
+ for (j = i + 1; j < tileSize; j++) w[j] = w[j] / u1;
+ } else {
+ for (j = i + 1; j < tileSize; j++) w[j] = 0.0;
+ }
- if(norm != 0.0)
- tau = -sign*u1/norm;
- else
- tau = 0.0;
+ if (norm != 0.0)
+ tau = -sign * u1 / norm;
+ else
+ tau = 0.0;
- /*Store the below diagonal vector */
-
- for(j = i+1; j < tileSize; j++)
- {
- cornerTile[ i*tileSize +j ] = w[j];
- }
- cornerTile[ i*tileSize + i ] = sign*norm;
- w[i] = 1.0;
- /* Apply the householder transformation to the rest of the tile, for everything to the right of the diagonal. */
- for(j = i+1; j < tileSize; j++)
- {
- /*Compute w'*A_j*/
- z = cornerTile[ j*tileSize+i];
- for(n = i+1; n < tileSize; n++)
- {
- z = z + cornerTile[j*tileSize+n] * w[n];
- }
- /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
- for(n=i; n < tileSize; n++)
- {
- cornerTile[j*tileSize+n] = cornerTile[j*tileSize+n] - tau*w[n]*z;
- }
-
- }
- /* Store tau. We're on k*tileSize+ith row. kth column.*/
- tauMatrix[(k*tileSize+i)*tauNum+k] = tau;
+ /*Store the below diagonal vector */
+ for (j = i + 1; j < tileSize; j++) {
+ cornerTile[i * tileSize + j] = w[j];
}
-
+ cornerTile[i * tileSize + i] = sign * norm;
+ w[i] = 1.0;
+ /* Apply the householder transformation to the rest of the tile, for
+ * everything to the right of the diagonal. */
+ for (j = i + 1; j < tileSize; j++) {
+ /*Compute w'*A_j*/
+ z = cornerTile[j * tileSize + i];
+ for (n = i + 1; n < tileSize; n++) {
+ z = z + cornerTile[j * tileSize + n] * w[n];
+ }
+ /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
+ for (n = i; n < tileSize; n++) {
+ cornerTile[j * tileSize + n] =
+ cornerTile[j * tileSize + n] - tau * w[n] * z;
+ }
+ }
+ /* Store tau. We're on k*tileSize+ith row. kth column.*/
+ tauMatrix[(k * tileSize + i) * tauNum + k] = tau;
+ }
}
-
/**
*
* @brief Applies the householder factorisation of the corner to the row tile.
@@ -312,46 +274,40 @@ void DGEQRF(double* restrict cornerTile, int tileSize, double* restrict tauMatri
* @param tauMatrix A pointer to the tau Matrix.
* @param jj The value of j for the QR decomposition.
* @param kk The value of k for the QR decomposition.
- * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ * @param tauNum The number of tau values stored for each row of the matrix
+* (this is equal to the number of tiles on each column).
*
*
*/
-void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize, int jj, int kk, double* restrict tauMatrix, int tauNum, double* w)
-{
- int i, j, n;
- double z=0.0;
-
-
- /* For each row in the corner Tile*/
- for(i = 0; i < tileSize; i++)
- {
- /*Get w for row i */
- for(j = i; j < tileSize; j++)
- {
- w[j] = cornerTile[i*tileSize + j];
- }
- w[i] = 1.0;
-
- /* Apply to the row Tile */
- for(j = 0; j < tileSize; j++)
- {
- z=0.0;
- /* Below Diagonal!*/
- /*Compute w'*A_j*/
- for(n = i; n < tileSize; n++)
- {
- z = z + w[n] * rowTile[j*tileSize+n];
- }
- for(n = i; n < tileSize; n++)
- {
- rowTile[j*tileSize+n] = rowTile[j*tileSize+n] - tauMatrix[(kk*tileSize+i)*tauNum+kk]*w[n]*z;
- }
- }
-
-
+void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize,
+ int jj, int kk, double* restrict tauMatrix, int tauNum) {
+ int i, j, n;
+ double z = 0.0;
+ double w[tileSize];
+
+ /* For each row in the corner Tile*/
+ for (i = 0; i < tileSize; i++) {
+ /*Get w for row i */
+ for (j = i; j < tileSize; j++) {
+ w[j] = cornerTile[i * tileSize + j];
}
-
-
+ w[i] = 1.0;
+
+ /* Apply to the row Tile */
+ for (j = 0; j < tileSize; j++) {
+ z = 0.0;
+ /* Below Diagonal!*/
+ /*Compute w'*A_j*/
+ for (n = i; n < tileSize; n++) {
+ z = z + w[n] * rowTile[j * tileSize + n];
+ }
+ for (n = i; n < tileSize; n++) {
+ rowTile[j * tileSize + n] =
+ rowTile[j * tileSize + n] -
+ tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z;
+ }
+ }
+ }
}
/**
@@ -364,83 +320,77 @@ void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize,
* @param tauMatrix A pointer to the tau Matrix.
* @param ii The value of i for the QR decomposition.
* @param kk The value of k for the QR decomposition.
- * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ * @param tauNum The number of tau values stored for each row of the matrix
+* (this is equal to the number of tiles on each column).
*
*
*/
-void DTSQRF( double* restrict cornerTile, double* restrict columnTile, int tilesize, int ii, int kk, double* restrict tauMatrix, int tauNum, double* w )
-{
- int i, j, n;
- double norm=0.0, sign, u1, tau, z;
-
- /* For each column compute the householder vector. */
- for(i = 0; i < tilesize; i++)
- {
- norm = 0.0;
- w[i] = cornerTile[ i*tilesize+i ];
- norm = norm + w[i]*w[i];
- for(j = i+1; j < tilesize; j++)
- {
- w[j] = 0.0;
- }
- for(j = 0; j < tilesize; j++)
- {
- w[tilesize+j] = columnTile[ i*tilesize+j ];
- norm = norm + w[tilesize+j]*w[tilesize+j];
- }
-
- norm = sqrt(norm);
- if(w[i] >= 0.0)
- sign = -1;
- else
- sign = 1;
-
-
- u1 = w[i] - sign*norm;
- if(u1 != 0.0)
- {
- for(j = i+1; j < 2*tilesize; j++){
- w[j] = w[j]/u1;
- }
- }else
- {
- for(j = i+1; j < 2*tilesize; j++)
- w[j] = 0.0;
- }
+void DTSQRF(double* restrict cornerTile, double* restrict columnTile,
+ int tilesize, int ii, int kk, double* restrict tauMatrix,
+ int tauNum) {
+ int i, j, n;
+ double norm = 0.0, sign, u1, tau, z;
+ double w[2*tilesize];
+
+ /* For each column compute the householder vector. */
+ for (i = 0; i < tilesize; i++) {
+ norm = 0.0;
+ w[i] = cornerTile[i * tilesize + i];
+ norm = norm + w[i] * w[i];
+ for (j = i + 1; j < tilesize; j++) {
+ w[j] = 0.0;
+ }
+ for (j = 0; j < tilesize; j++) {
+ w[tilesize + j] = columnTile[i * tilesize + j];
+ norm = norm + w[tilesize + j] * w[tilesize + j];
+ }
- if(norm != 0)
- tau = -sign*u1/norm;
- else
- tau = 0.0;
-
- /* Apply to each row to the right.*/
- for(j = i; j < tilesize; j++)
- {
- /* Find w'*A_j, w is 0s except for first value with upper tile.*/
- z = 1.0 * cornerTile[ j*tilesize+i ];
- for(n = 0; n < tilesize; n++)
- {
- z = z + w[ tilesize+n ]*columnTile[ j*tilesize+n ];
- }
- /* Apply to upper tile.*/
- cornerTile[j*tilesize+i] = cornerTile[j*tilesize+i ] - tau*1.0*z;
- for(n = i+1; n < tilesize; n++)
- {
- cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n ] - tau*w[n]*z;
- }
- /* Apply to lower tile.*/
- for(n = 0; n < tilesize; n++)
- {
- columnTile[ j*tilesize+n] = columnTile[ j*tilesize+n ] - tau*w[tilesize+n]*z;
- }
-
- }
- /* Store w*/
- for(j = 0; j < tilesize; j++){
- columnTile[ i*tilesize+j ] = w[tilesize+j];
- }
- tauMatrix[(kk*tilesize+i)*tauNum+ ii] = tau;
+ norm = sqrt(norm);
+ if (w[i] >= 0.0)
+ sign = -1;
+ else
+ sign = 1;
+
+ u1 = w[i] - sign * norm;
+ if (u1 != 0.0) {
+ for (j = i + 1; j < 2 * tilesize; j++) {
+ w[j] = w[j] / u1;
+ }
+ } else {
+ for (j = i + 1; j < 2 * tilesize; j++) w[j] = 0.0;
}
+
+ if (norm != 0)
+ tau = -sign * u1 / norm;
+ else
+ tau = 0.0;
+
+ /* Apply to each row to the right.*/
+ for (j = i; j < tilesize; j++) {
+ /* Find w'*A_j, w is 0s except for first value with upper tile.*/
+ z = 1.0 * cornerTile[j * tilesize + i];
+ for (n = 0; n < tilesize; n++) {
+ z = z + w[tilesize + n] * columnTile[j * tilesize + n];
+ }
+ /* Apply to upper tile.*/
+ cornerTile[j * tilesize + i] =
+ cornerTile[j * tilesize + i] - tau * 1.0 * z;
+ for (n = i + 1; n < tilesize; n++) {
+ cornerTile[j * tilesize + n] =
+ cornerTile[j * tilesize + n] - tau * w[n] * z;
+ }
+ /* Apply to lower tile.*/
+ for (n = 0; n < tilesize; n++) {
+ columnTile[j * tilesize + n] =
+ columnTile[j * tilesize + n] - tau * w[tilesize + n] * z;
+ }
+ }
+ /* Store w*/
+ for (j = 0; j < tilesize; j++) {
+ columnTile[i * tilesize + j] = w[tilesize + j];
+ }
+ tauMatrix[(kk * tilesize + i) * tauNum + ii] = tau;
+ }
}
/**
@@ -455,46 +405,45 @@ void DTSQRF( double* restrict cornerTile, double* restrict columnTile, int tiles
* @param ii The value of i for the QR decomposition.
* @param jj The value of j for the QR decomposition.
* @param kk The value of k for the QR decomposition.
- * @param tauNum The number of tau values stored for each row of the matrix (this is equal to the number of tiles on each column).
+ * @param tauNum The number of tau values stored for each row of the matrix
+* (this is equal to the number of tiles on each column).
*
*
*/
-void DSSRFT( double* restrict cornerTile, double* restrict columnTile, double* restrict rowTile, int tilesize, int ii, int jj, int kk, double* restrict tauMatrix, int tauNum , double* w)
-{
- int i, j, n;
- double z;
-
-
- for(i = 0; i < tilesize; i++)
- {
- for(j = 0; j < i; j++)
- w[j] = 0.0;
- w[i] = 1.0;
- for(j = i+1; j < tilesize; j++)
- w[j] = 0.0;
- for(j = 0; j < tilesize; j++)
- w[j+tilesize] = columnTile[i*tilesize +j];
-
- /* Apply householder vector (w) to the tiles.*/
- for(j = 0; j < tilesize; j++)
- {
- z = 0.0;
- /* Compute w' * A_j */
- for(n = 0; n < tilesize; n++)
- {
- z += w[n] * rowTile[j*tilesize+n];
- z += w[n + tilesize] * cornerTile[j*tilesize+n];
- }
- for(n = 0; n < tilesize; n++)
- {
- rowTile[j*tilesize + n] = rowTile[j*tilesize + n] - tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[n]*z;
- cornerTile[j*tilesize+n] = cornerTile[j*tilesize+n]- tauMatrix[(kk*tilesize+i)*tauNum+ii]*w[tilesize+n]*z;
- }
- }
+void DSSRFT(double* restrict cornerTile, double* restrict columnTile,
+ double* restrict rowTile, int tilesize, int ii, int jj, int kk,
+ double* restrict tauMatrix, int tauNum) {
+ int i, j, n;
+ double z;
+ double w[2*tilesize];
+
+ for (i = 0; i < tilesize; i++) {
+ for (j = 0; j < i; j++) w[j] = 0.0;
+ w[i] = 1.0;
+ for (j = i + 1; j < tilesize; j++) w[j] = 0.0;
+ for (j = 0; j < tilesize; j++)
+ w[j + tilesize] = columnTile[i * tilesize + j];
+
+ /* Apply householder vector (w) to the tiles.*/
+ for (j = 0; j < tilesize; j++) {
+ z = 0.0;
+ /* Compute w' * A_j */
+ for (n = 0; n < tilesize; n++) {
+ z += w[n] * rowTile[j * tilesize + n];
+ z += w[n + tilesize] * cornerTile[j * tilesize + n];
+ }
+ for (n = 0; n < tilesize; n++) {
+ rowTile[j * tilesize + n] =
+ rowTile[j * tilesize + n] -
+ tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[n] * z;
+ cornerTile[j * tilesize + n] =
+ cornerTile[j * tilesize + n] -
+ tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[tilesize + n] * z;
+ }
}
+ }
+}
-}
-
/**
* @brief Computed a tiled QR factorization using QuickSched.
*
@@ -502,302 +451,298 @@ void DSSRFT( double* restrict cornerTile, double* restrict columnTile, double* r
* @param n Number of tile columns.
* @param nr_threads Number of threads to use.
*/
-
-void test_qr ( int m , int n , int K , int nr_threads , int runs, double *matrix ) {
-
- int k, j, i;
- double *A, *A_orig, *tau;
- struct qsched s;
- qsched_task_t *tid, tid_new;
- qsched_res_t *rid;
- int data[3];
- ticks tic, toc_run, tot_setup, tot_run = 0;
-
- enum task_types { task_DGEQRF , task_DLARFT , task_DTSQRF , task_DSSRFT };
-
-
- /* Runner function to pass to the scheduler. */
- void runner ( int type , void *data ) {
-
- /* Decode the task data. */
- int *idata = (int *)data;
- int i = idata[0], j = idata[1], k = idata[2];
- double ww[ 2*K ];
-
- /* Decode and execute the task. */
- switch ( type ) {
- case task_DGEQRF:
- DGEQRF( &A[ (k*m+k)*K*K], K, tau, k, m, ww);
- break;
- case task_DLARFT:
- DLARFT( &A[ (k*m+k)*K*K ],&A[(j*m+k)*K*K], K, j, k, tau, m, ww);
- break;
- case task_DTSQRF:
- DTSQRF( &A[(k*m+k)*K*K], &A[(k*m+i)*K*K], K, i, k, tau, m , ww);
- break;
- case task_DSSRFT:
- DSSRFT(&A[(j*m+i)*K*K], &A[(k*m+i)*K*K], &A[(j*m+k)*K*K], K, i, j, k, tau, m , ww);
- break;
- default:
- error( "Unknown task type." );
- }
- }
-
-
- /* Allocate and fill the original matrix. */
- if ( ( A = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL ||
- ( tau = (double *)malloc( sizeof(double) * m * n * K ) ) == NULL ||
- ( A_orig = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL )
- error( "Failed to allocate matrices." );
- for ( k = 0 ; k < m * n * K * K ; k++ )
- {
- if(matrix == NULL)
- A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0;
- else
- A_orig[k] = matrix[k];
+void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
+
+ int k, j, i;
+ double* A, *A_orig, *tau;
+ struct qsched s;
+ qsched_task_t* tid, tid_new;
+ qsched_res_t* rid;
+ int data[3];
+ ticks tic, toc_run, tot_setup, tot_run = 0;
+
+ enum task_types {
+ task_DGEQRF,
+ task_DLARFT,
+ task_DTSQRF,
+ task_DSSRFT
+ };
+
+ /* Runner function to pass to the scheduler. */
+ void runner(int type, void * data) {
+
+ /* Decode the task data. */
+ int* idata = (int*)data;
+ int i = idata[0], j = idata[1], k = idata[2];
+
+ /* Decode and execute the task. */
+ switch (type) {
+ case task_DGEQRF:
+ DGEQRF(&A[(k * m + k) * K * K], K, tau, k, m);
+ break;
+ case task_DLARFT:
+ DLARFT(&A[(k * m + k) * K * K], &A[(j * m + k) * K * K], K, j, k, tau,
+ m);
+ break;
+ case task_DTSQRF:
+ DTSQRF(&A[(k * m + k) * K * K], &A[(k * m + i) * K * K], K, i, k, tau,
+ m);
+ break;
+ case task_DSSRFT:
+ DSSRFT(&A[(j * m + i) * K * K], &A[(k * m + i) * K * K],
+ &A[(j * m + k) * K * K], K, i, j, k, tau, m);
+ break;
+ default:
+ error("Unknown task type.");
}
- memcpy( A , A_orig , sizeof(double) * m * n * K * K );
- bzero( tau , sizeof(double) * m * n * K );
-
- /* Dump A_orig. */
- /* message( "A_orig = [" );
- for ( k = 0 ; k < m*K ; k++ ) {
- for ( j = 0 ; j < n*K ; j++ )
- printf( "%.3f " , A_orig[ j*m*K + k ] );
- printf( "\n" );
- }
- printf( "];\n" ); */
-
- /* Initialize the scheduler. */
- qsched_init( &s , nr_threads , qsched_flag_none );
-
- /* Allocate and init the task ID and resource ID matrix. */
- tic = getticks();
- if ( ( tid = (qsched_task_t *)malloc( sizeof(qsched_task_t) * m * n ) ) == NULL ||
- ( rid = (qsched_res_t *)malloc( sizeof(qsched_res_t) * m * n ) ) == NULL )
- error( "Failed to allocate tid/rid matrix." );
- for ( k = 0 ; k < m * n ; k++ ) {
- tid[k] = qsched_task_none;
- rid[k] = qsched_addres( &s , qsched_owner_none , qsched_res_none );
- }
-
- /* Build the tasks. */
- for ( k = 0 ; k < m && k < n ; k++ ) {
-
- /* Add kth corner task. */
- data[0] = k; data[1] = k; data[2] = k;
- tid_new = qsched_addtask( &s , task_DGEQRF , task_flag_none , data , sizeof(int)*3 , 2 );
- qsched_addlock( &s , tid_new , rid[ k*m + k ] );
- if ( tid[ k*m + k ] != -1 )
- qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
- tid[ k*m + k ] = tid_new;
-
- /* Add column tasks on kth row. */
- for ( j = k+1 ; j < n ; j++ ) {
- data[0] = k; data[1] = j; data[2] = k;
- tid_new = qsched_addtask( &s , task_DLARFT , task_flag_none , data , sizeof(int)*3 , 3 );
- qsched_addlock( &s , tid_new , rid[ j*m + k ] );
- qsched_adduse( &s , tid_new , rid[ k*m + k ] );
- qsched_addunlock( &s , tid[ k*m + k ] , tid_new );
- if ( tid[ j*m + k ] != -1 )
- qsched_addunlock( &s , tid[ j*m + k ] , tid_new );
- tid[ j*m + k ] = tid_new;
- }
-
- /* For each following row... */
- for ( i = k+1 ; i < m ; i++ ) {
-
- /* Add the row taks for the kth column. */
- data[0] = i; data[1] = k; data[2] = k;
- tid_new = qsched_addtask( &s , task_DTSQRF , task_flag_none , data , sizeof(int)*3 , 3 );
- qsched_addlock( &s , tid_new , rid[ k*m + i ] );
- qsched_adduse( &s , tid_new , rid[ k*m + k ] );
- qsched_addunlock( &s , tid[ k*m + (i-1) ] , tid_new );
- if ( tid[ k*m + i ] != -1 )
- qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
- tid[ k*m + i ] = tid_new;
-
- /* Add the inner tasks. */
- for ( j = k+1 ; j < n ; j++ ) {
- data[0] = i; data[1] = j; data[2] = k;
- tid_new = qsched_addtask( &s , task_DSSRFT , task_flag_none , data , sizeof(int)*3 , 5 );
- qsched_addlock( &s , tid_new , rid[ j*m + i ] );
- qsched_adduse( &s , tid_new , rid[ k*m + i ] );
- qsched_adduse( &s , tid_new , rid[ j*m + k ] );
- qsched_addunlock( &s , tid[ k*m + i ] , tid_new );
- qsched_addunlock( &s, tid[j*m+i-1], tid_new);
- if ( tid[ j*m + i ] != -1 )
- qsched_addunlock( &s , tid[ j*m + i ] , tid_new );
-
- tid[ j*m + i ] = tid_new;
- }
-
- }
-
- } /* build the tasks. */
- tot_setup = getticks() - tic;
-
- /* Dump the number of tasks. */
- message( "total nr of tasks: %i." , s.count );
- message( "total nr of deps: %i." , s.count_deps );
- message( "total nr of res: %i." , s.count_res );
- message( "total nr of locks: %i." , s.count_locks );
- message( "total nr of uses: %i." , s.count_uses );
-
-
- /* Loop over the number of runs. */
- for ( k = 0 ; k < runs ; k++ ) {
-
- /* Execute the the tasks. */
- tic = getticks();
- qsched_run( &s , nr_threads , runner );
- toc_run = getticks();
- message( "%ith run took %lli ticks..." , k , toc_run - tic );
- tot_run += toc_run - tic;
-
- }
-
-
- /* Dump A. */
- /* message( "A = [" );
- for ( k = 0 ; k < m*K ; k++ ) {
- for ( j = 0 ; j < n*K ; j++ )
- printf( "%.3f " , A[ j*m*K + k ] );
- printf( "\n" );
- }
- printf( "];\n" ); */
-
- /* Dump tau. */
- /* message( "tau = [" );
- for ( k = 0 ; k < m*K ; k++ ) {
- for ( j = 0 ; j < n ; j++ )
- printf( "%.3f " , tau[ j*m*K + k ] );
- printf( "\n" );
- }
- printf( "];\n" ); */
-
- /* Dump the tasks. */
- /* for ( k = 0 ; k < s.count ; k++ ) {
- int *d = (int *)&s.data[ s.tasks[k].data ];
- printf( " %i %i %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid , d[0] , d[1] , s.tasks[k].tic , s.tasks[k].toc );
- } */
-
- /* Dump the costs. */
- message( "costs: setup=%lli ticks, run=%lli ticks." ,
- tot_setup , tot_run/runs );
-
- /* Dump the timers. */
- for ( k = 0 ; k < qsched_timer_count ; k++ )
- message( "timer %s is %lli ticks." , qsched_timer_names[k] , s.timers[k]/runs );
-
- if(matrix != NULL)
- {
- for(k = 0; k < m*n*K*K; k++)
- matrix[k] = A[k];
+ }
+
+ /* Allocate and fill the original matrix. */
+ if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
+ (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL ||
+ (A_orig = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL)
+ error("Failed to allocate matrices.");
+ for (k = 0; k < m * n * K * K; k++) {
+ if (matrix == NULL)
+ A_orig[k] = 2 * ((double)rand()) / RAND_MAX - 1.0;
+ else
+ A_orig[k] = matrix[k];
+ }
+ memcpy(A, A_orig, sizeof(double) * m * n * K * K);
+ bzero(tau, sizeof(double) * m * n * K);
+
+ /* Dump A_orig. */
+ /* message( "A_orig = [" );
+ for ( k = 0 ; k < m*K ; k++ ) {
+ for ( j = 0 ; j < n*K ; j++ )
+ printf( "%.3f " , A_orig[ j*m*K + k ] );
+ printf( "\n" );
+ }
+ printf( "];\n" ); */
+
+ /* Initialize the scheduler. */
+ qsched_init(&s, nr_threads, qsched_flag_none);
+
+ /* Allocate and init the task ID and resource ID matrix. */
+ tic = getticks();
+ if ((tid = (qsched_task_t*)malloc(sizeof(qsched_task_t)* m* n)) == NULL ||
+ (rid = (qsched_res_t*)malloc(sizeof(qsched_res_t) * m * n)) == NULL)
+ error("Failed to allocate tid/rid matrix.");
+ for (k = 0; k < m * n; k++) {
+ tid[k] = qsched_task_none;
+ rid[k] = qsched_addres(&s, qsched_owner_none, qsched_res_none);
+ }
+
+ /* Build the tasks. */
+ for (k = 0; k < m && k < n; k++) {
+
+ /* Add kth corner task. */
+ data[0] = k;
+ data[1] = k;
+ data[2] = k;
+ tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, data,
+ sizeof(int) * 3, 2);
+ qsched_addlock(&s, tid_new, rid[k * m + k]);
+ if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new);
+ tid[k * m + k] = tid_new;
+
+ /* Add column tasks on kth row. */
+ for (j = k + 1; j < n; j++) {
+ data[0] = k;
+ data[1] = j;
+ data[2] = k;
+ tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, data,
+ sizeof(int) * 3, 3);
+ qsched_addlock(&s, tid_new, rid[j * m + k]);
+ qsched_adduse(&s, tid_new, rid[k * m + k]);
+ qsched_addunlock(&s, tid[k * m + k], tid_new);
+ if (tid[j * m + k] != -1) qsched_addunlock(&s, tid[j * m + k], tid_new);
+ tid[j * m + k] = tid_new;
}
-
- /* Test if the decomposition was correct.*/
- /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
- double *Q = computeQ(tempMatrix, m*K, K, tau, m);
- double *R = getR(tempMatrix, m*K);
- cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q, m*K, R, m*K, 0.0, tempMatrix, m*K);
- free(Q);
- Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
- for(i = 0; i < m * n * K * K; i++)
- {
- if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] < 0.995))
- printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i], tempMatrix[i]);
- }
- free(tempMatrix);
- free(Q);
- free(R);*/
-
- /* Clean up. */
- free(A);
- free(A_orig);
- free(tau);
- free(tid);
- free(rid);
- qsched_free( &s );
-
+ /* For each following row... */
+ for (i = k + 1; i < m; i++) {
+
+ /* Add the row taks for the kth column. */
+ data[0] = i;
+ data[1] = k;
+ data[2] = k;
+ tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, data,
+ sizeof(int) * 3, 3);
+ qsched_addlock(&s, tid_new, rid[k * m + i]);
+ qsched_adduse(&s, tid_new, rid[k * m + k]);
+ qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
+ if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
+ tid[k * m + i] = tid_new;
+
+ /* Add the inner tasks. */
+ for (j = k + 1; j < n; j++) {
+ data[0] = i;
+ data[1] = j;
+ data[2] = k;
+ tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, data,
+ sizeof(int) * 3, 5);
+ qsched_addlock(&s, tid_new, rid[j * m + i]);
+ qsched_adduse(&s, tid_new, rid[k * m + i]);
+ qsched_adduse(&s, tid_new, rid[j * m + k]);
+ // qsched_addunlock(&s, tid[k * m + i], tid_new);
+ qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
+ if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
+
+ tid[j * m + i] = tid_new;
+ }
}
+ } /* build the tasks. */
+ tot_setup = getticks() - tic;
-/* Generates a random matrix. */
-double* generateColumnMatrix(int size)
-{
- double* matrix = malloc(sizeof(double)*size*size);
- if(matrix == NULL)
- error("Failed to allocate matrix");
-
- unsigned long int m_z = 35532;
- int i;
- for(i = 0 ; i < size*size; i++)
- {
- m_z = (1664525*m_z + 1013904223) % 4294967296;
- matrix[i] = m_z % 100;
- if(matrix[i] < 0)
- matrix[i] += 100;
- }
- return matrix;
+ /* Dump the number of tasks. */
+ message("total nr of tasks: %i.", s.count);
+ message("total nr of deps: %i.", s.count_deps);
+ message("total nr of res: %i.", s.count_res);
+ message("total nr of locks: %i.", s.count_locks);
+ message("total nr of uses: %i.", s.count_uses);
+
+ /* Loop over the number of runs. */
+ for (k = 0; k < runs; k++) {
+
+ /* Execute the the tasks. */
+ tic = getticks();
+ qsched_run(&s, nr_threads, runner);
+ toc_run = getticks();
+ message("%ith run took %lli ticks...", k, toc_run - tic);
+ tot_run += toc_run - tic;
+ }
+
+ /* Dump A. */
+ /* message( "A = [" );
+ for ( k = 0 ; k < m*K ; k++ ) {
+ for ( j = 0 ; j < n*K ; j++ )
+ printf( "%.3f " , A[ j*m*K + k ] );
+ printf( "\n" );
+ }
+ printf( "];\n" ); */
+
+ /* Dump tau. */
+ /* message( "tau = [" );
+ for ( k = 0 ; k < m*K ; k++ ) {
+ for ( j = 0 ; j < n ; j++ )
+ printf( "%.3f " , tau[ j*m*K + k ] );
+ printf( "\n" );
+ }
+ printf( "];\n" ); */
+
+ /* Dump the tasks. */
+ /* for ( k = 0 ; k < s.count ; k++ ) {
+ int *d = (int *)&s.data[ s.tasks[k].data ];
+ printf( " %i %i %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid ,
+ d[0] , d[1] , s.tasks[k].tic , s.tasks[k].toc );
+ } */
+
+ /* Dump the costs. */
+ message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup,
+ tot_run / runs);
+
+ /* Dump the timers. */
+ for (k = 0; k < qsched_timer_count; k++)
+ message("timer %s is %lli ticks.", qsched_timer_names[k],
+ s.timers[k] / runs);
+
+ if (matrix != NULL) {
+ for (k = 0; k < m * n * K * K; k++) matrix[k] = A[k];
+ }
+
+ /* Test if the decomposition was correct.*/
+ /*double *tempMatrix = tileToColumn(A, m*n*K*K, m, n, K);
+ double *Q = computeQ(tempMatrix, m*K, K, tau, m);
+ double *R = getR(tempMatrix, m*K);
+ cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, m*K, m*K, m*K, 1.0, Q,
+ m*K, R, m*K, 0.0, tempMatrix, m*K);
+ free(Q);
+ Q = tileToColumn(A_orig, m*n*K*K, m, n, K);
+ for(i = 0; i < m * n * K * K; i++)
+ {
+ if(Q[i] != 0 && (Q[i] / tempMatrix[i] > 1.005 || Q[i] / tempMatrix[i] <
+ 0.995))
+ printf("Not correct at value %i %.3f %.3e %.3e\n", i, A[i], Q[i],
+ tempMatrix[i]);
+ }
+ free(tempMatrix);
+ free(Q);
+ free(R);*/
+
+ /* Clean up. */
+ free(A);
+ free(A_orig);
+ free(tau);
+ free(tid);
+ free(rid);
+ qsched_free(&s);
}
+/* Generates a random matrix. */
+double* generateColumnMatrix(int size) {
+ double* matrix = malloc(sizeof(double) * size * size);
+ if (matrix == NULL) error("Failed to allocate matrix");
+
+ unsigned long int m_z = 35532;
+ int i;
+ for (i = 0; i < size * size; i++) {
+ m_z = (1664525 * m_z + 1013904223) % 4294967296;
+ matrix[i] = m_z % 100;
+ if (matrix[i] < 0) matrix[i] += 100;
+ }
+ return matrix;
+}
/**
* @brief Main function.
*/
-
-int main ( int argc , char *argv[] ) {
-
- int c, nr_threads;
- int M = 4, N = 4, runs = 1, K = 32;
-
- /* Get the number of threads. */
- #pragma omp parallel shared(nr_threads)
- {
- if ( omp_get_thread_num() == 0 )
- nr_threads = omp_get_num_threads();
- }
-
- /* Parse the options */
- while ( ( c = getopt( argc , argv , "m:n:k:r:t:" ) ) != -1 )
- switch( c ) {
- case 'm':
- if ( sscanf( optarg , "%d" , &M ) != 1 )
- error( "Error parsing dimension M." );
- break;
- case 'n':
- if ( sscanf( optarg , "%d" , &N ) != 1 )
- error( "Error parsing dimension M." );
- break;
- case 'k':
- if ( sscanf( optarg , "%d" , &K ) != 1 )
- error( "Error parsing tile size." );
- break;
- case 'r':
- if ( sscanf( optarg , "%d" , &runs ) != 1 )
- error( "Error parsing number of runs." );
- break;
- case 't':
- if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
- error( "Error parsing number of threads." );
- omp_set_num_threads( nr_threads );
- break;
- case '?':
- fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] );
- fprintf( stderr , "Computes the tiled QR decomposition of an MxN tiled\n"
- "matrix using nr_threads threads.\n" );
- exit( EXIT_FAILURE );
- }
-
- /* Dump arguments. */
- message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
- 32*M , 32*N , nr_threads , runs );
-
- test_qr( M , N , K , nr_threads , runs , NULL);
-
+
+int main(int argc, char* argv[]) {
+
+ int c, nr_threads;
+ int M = 4, N = 4, runs = 1, K = 32;
+
+/* Get the number of threads. */
+#pragma omp parallel shared(nr_threads)
+ {
+ if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
+ }
+
+ /* Parse the options */
+ while ((c = getopt(argc, argv, "m:n:k:r:t:")) != -1) switch (c) {
+ case 'm':
+ if (sscanf(optarg, "%d", &M) != 1) error("Error parsing dimension M.");
+ break;
+ case 'n':
+ if (sscanf(optarg, "%d", &N) != 1) error("Error parsing dimension M.");
+ break;
+ case 'k':
+ if (sscanf(optarg, "%d", &K) != 1) error("Error parsing tile size.");
+ break;
+ case 'r':
+ if (sscanf(optarg, "%d", &runs) != 1)
+ error("Error parsing number of runs.");
+ break;
+ case 't':
+ if (sscanf(optarg, "%d", &nr_threads) != 1)
+ error("Error parsing number of threads.");
+ omp_set_num_threads(nr_threads);
+ break;
+ case '?':
+ fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
+ argv[0]);
+ fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n"
+ "matrix using nr_threads threads.\n");
+ exit(EXIT_FAILURE);
}
-
-
+
+ /* Dump arguments. */
+ message("Computing the tiled QR decomposition of a %ix%i matrix using %i "
+ "threads (%i runs).",
+ 32 * M, 32 * N, nr_threads, runs);
+
+ test_qr(M, N, K, nr_threads, runs, NULL);
+}
diff --git a/examples/test_qr_ompss.c b/examples/test_qr_ompss.c
index 014d1e0d2b0227379c69917690c3e270c4179979..8d9286775f01d4ac3b8514f6a657c20f2eede68f 100644
--- a/examples/test_qr_ompss.c
+++ b/examples/test_qr_ompss.c
@@ -1,22 +1,21 @@
/*******************************************************************************
* This file is part of QuickSched.
* Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
- *
+ *
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
- *
+ *
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
- *
+ *
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
- *
- ******************************************************************************/
-
+ *
+* *****************************************************************************/
/* Config parameters. */
#include "../config.h"
@@ -33,262 +32,378 @@
#include <lapacke.h>
#include <cblas.h>
-
/* Local includes. */
#include "cycle.h"
-
/* Error macro. */
-#define error(s, ...) { fprintf( stderr , "%s:%s():%i: " s "\n" , __FILE__ , __FUNCTION__ , __LINE__ , ##__VA_ARGS__ ); abort(); }
+#define error(s, ...) \
+ { \
+ fprintf(stderr, "%s:%s():%i: " s "\n", __FILE__, __FUNCTION__, __LINE__, \
+ ##__VA_ARGS__); \
+ abort(); \
+ }
/* Message macro. */
-#define message(s, ...) { printf( "%s: " s "\n" , __FUNCTION__ , ##__VA_ARGS__ ); fflush(stdout); }
-
+#define message(s, ...) \
+ { \
+ printf("%s: " s "\n", __FUNCTION__, ##__VA_ARGS__); \
+ fflush(stdout); \
+ }
/* Stuff to collect task data. */
struct timer {
- int threadID, type;
- ticks tic, toc;
- };
-struct timer *timers;
+ int threadID, type;
+ ticks tic, toc;
+};
+struct timer* timers;
int nr_timers = 0;
-/*
- * Sam's routines for the tiled QR decomposition.
- */
-
-/*
- \brief Computes 2-norm of a vector \f$x\f$
-
- Computes the 2-norm by computing the following: \f[\textrm{2-norm}=\sqrt_0^lx(i)^2\f]
- */
-double do2norm(double* x, int l)
-{
- double sum = 0, norm;
- int i;
-
- for(i = 0; i < l; i++)
- sum += x[i] * x[i];
+double* columnToTile(double* columnMatrix, int size, int m, int n,
+ int tilesize) {
+ double* TileMatrix;
+ TileMatrix = malloc(sizeof(double) * size);
+ if (TileMatrix == NULL) error("failed to allocate TileMatrix");
+ int i, j, k, l;
+
+ for (i = 0; i < n; i++) {
+ for (j = 0; j < m; j++) {
+ double* tileStart =
+ &columnMatrix[i * m * tilesize * tilesize + j * tilesize];
+ double* tilePos =
+ &TileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+ for (k = 0; k < tilesize; k++) {
+ tileStart = &columnMatrix[i * m * tilesize * tilesize +
+ k * m * tilesize + j * tilesize];
+ for (l = 0; l < tilesize; l++) {
+ tilePos[k * tilesize + l] = tileStart[l];
+ }
+ }
+ }
+ }
- norm = sqrt(sum);
+ return TileMatrix;
+}
- return norm;
+double* tileToColumn(double* tileMatrix, int size, int m, int n, int tilesize) {
+ double* ColumnMatrix;
+ ColumnMatrix = (double*)malloc(sizeof(double) * size);
+ if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix");
+ int i, j, k, l;
+ for (i = 0; i < n; i++) {
+ for (j = 0; j < m; j++) {
+ /* Tile on ith column is at i*m*32*32.*/
+ /* Tile on jth is at j*32*32 */
+ double* tile =
+ &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+ /* Column starts at same position as tile. */
+ /* Row j*32.*/
+ double* tilePos =
+ &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize];
+ for (k = 0; k < tilesize; k++) {
+ for (l = 0; l < tilesize; l++) {
+ tilePos[l] = tile[l];
+ }
+ /* Next 32 elements are the position of the tile in the next column.*/
+ tile = &tile[tilesize];
+ /* Move to the j*32th position in the next column. */
+ tilePos = &tilePos[tilesize * m];
+ }
+ }
+ }
+ return ColumnMatrix;
}
+/* Routines for the tiled QR decomposition.*/
+
/**
- * \brief Computes a Householder reflector from a pair of vectors from coupled blocks
*
- * Calculates the Householder vector of the vector formed by a column in a pair of coupled blocks.
- * There is a single non-zero element, in the first row, of the top vector. This is passed as topDiag
+ * @brief Computes the QR decomposition of a tile.
+ *
+ * @param cornerTile A pointer to the tile for which the decomposition is
+* computed.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param k The value of k for the QR decomposition
+ * @param tauNum The number of tau values stored for each row of the matrix
+* (this is equal to the number of tiles on each column).
*
- * \param topDiag The only non-zero element of the incoming vector in the top block
- * \param ma The number of elements in the top vector
- * \param xb Pointer to the lower vector
- * \param l The number of elements in the whole vector
- * \param vk A pointer to a pre-allocated array to store the householder vector of size l
*
- * \returns void
*/
-void calcvkDouble (double topDiag,
- int ma,
- double* xb,
- int l,
- double* vk)
-{
- int sign, i;
- double norm, div;
- //same non-standard normalisation as for single blocks above, but organised without a temporary beta veriable
-
- sign = topDiag >= 0.0 ? 1 : -1;
- vk[0] = topDiag;
- //use vk[0] as beta
- for(i = 1; i < ma; i++)
- vk[i] = 0;
-
- for(; i < l; i++)
- vk[i] = xb[i - ma];
-
- norm = do2norm(vk, l);
- vk[0] += norm * sign;
-
- if(norm != 0.0)
- {
- div = 1/vk[0];
-
- for(i = 1; i < l; i++)
- vk[i] *= div;
- }
-}
-
-
-void updateDoubleQ_WY (double* blockA,
- double* blockB,
- double* blockTau,
- int k, int ma, int mb, int n,
- int ldm,
- double* hhVector)//bottom, essential part.
-{
- int i, j;
-
- double tau = 1.0, beta;
-
- /* Compute tau = 2/v'v */
- for(i = 0; i < mb; i ++)
- tau += hhVector[i] * hhVector[i];
-
- tau = 2/tau;
-
- for(j = k; j < n; j ++)
- {
- /* Compute v'*b_j */
- beta = blockA[(j*ldm) + k];
+#pragma omp task inout(cornerTile[0]) out(tauMatrix[0])
+void DGEQRF(double* restrict cornerTile, int tileSize,
+ double* restrict tauMatrix, int k, int tauNum) {
+ int i, j, n;
+ double norm = 0.0, sign, u1, tau, z;
+ double w[tileSize];
+
+ /* Timer stuff. */
+ int ind = __sync_fetch_and_add( &nr_timers , 1 );
+ timers[ind].threadID = omp_get_thread_num();
+ timers[ind].type = 0;
+ timers[ind].tic = getticks();
+
+ /*Find the householder vector for each row. */
+ for (i = 0; i < tileSize; i++) {
+ norm = 0.0;
+ /*Fill w with the vector.*/
+ for (j = i; j < tileSize; j++) {
+ /* ith row is i*tileSize, only want elements on diagonal or below.*/
+ w[j] = cornerTile[i * tileSize + j];
+ /*Find the norm as well */
+ norm = norm + w[j] * w[j];
+ }
+ if (w[i] >= 0.0)
+ sign = -1;
+ else
+ sign = 1;
- /* Then for lower half */
- for(i = 0; i < mb; i ++)
- beta += blockB[(j*ldm) + i] * hhVector[i];
+ norm = sqrt(norm);
- beta *= tau;
+ u1 = w[i] - sign * norm;
- /* Compute b_j = b_j - beta*v_k */
- blockA[(j*ldm) + k] -= beta;
-
- for(i = 0; i < mb; i ++)
- blockB[(j*ldm) + i] -= beta * hhVector[i];
- }
+ if (u1 != 0.0) {
+ for (j = i + 1; j < tileSize; j++) w[j] = w[j] / u1;
+ } else {
+ for (j = i + 1; j < tileSize; j++) w[j] = 0.0;
+ }
- /* Insert vector below diagonal. */
- for(i = 0; i < mb; i ++)
- blockB[(k*ldm) + i] = hhVector[i];
+ if (norm != 0.0)
+ tau = -sign * u1 / norm;
+ else
+ tau = 0.0;
- blockTau[k] = tau;
-}
+ /*Store the below diagonal vector */
-// #pragma omp task in( blockA[0] ) inout( blockB[0] ) in( blockTau[0] )
-void DTSQRF (double* blockA,
- double* blockB,
- double* blockTau,
- int ma,
- int mb,
- int n,
- int ldm )
-{
- int k;
- double* xVectA, *xVectB;
- double hhVector[ 2*ma ];
-
- int ind = __sync_fetch_and_add( &nr_timers , 1 );
- timers[ind].threadID = omp_get_thread_num();
- timers[ind].type = 2;
- timers[ind].tic = getticks();
-
- xVectA = blockA;
- xVectB = blockB;
-
- for(k = 0; k < n; k++)
- {
- //vk = sign(x[1])||x||_2e1 + x
- //vk = vk/vk[0]
- calcvkDouble(xVectA[0], ma - k, xVectB, (ma + mb) - k, hhVector);//returns essential
-
- //matA(k:ma,k:na) = matA(k:ma,k:na) - (2/(vk.T*vk))*vk*(vk.T*matA(k:ma,k:na)
- //update both blocks, preserving the vectors already stored below the diagonal in the top block and treating them as if they were zeros.
- updateDoubleQ_WY (blockA, blockB,
- blockTau,
- k, ma, mb, n,
- ldm,
- hhVector + ma - k);
-
- xVectA += ldm + 1;
- xVectB += ldm;
- }
- timers[ind].toc = getticks();
+ for (j = i + 1; j < tileSize; j++) {
+ cornerTile[i * tileSize + j] = w[j];
+ }
+ cornerTile[i * tileSize + i] = sign * norm;
+ w[i] = 1.0;
+ /* Apply the householder transformation to the rest of the tile, for
+ * everything to the right of the diagonal. */
+ for (j = i + 1; j < tileSize; j++) {
+ /*Compute w'*A_j*/
+ z = cornerTile[j * tileSize + i];
+ for (n = i + 1; n < tileSize; n++) {
+ z = z + cornerTile[j * tileSize + n] * w[n];
+ }
+ /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
+ for (n = i; n < tileSize; n++) {
+ cornerTile[j * tileSize + n] =
+ cornerTile[j * tileSize + n] - tau * w[n] * z;
+ }
+ }
+ /* Store tau. We're on k*tileSize+ith row. kth column.*/
+ tauMatrix[(k * tileSize + i) * tauNum + k] = tau;
+ }
+ timers[ind].toc = getticks();
}
-// #pragma omp task in( blockV[0] ) in( blockB[0] ) inout( blockA[0] ) in( blockTau[0] )
-void DSSRFT (double* blockV,
- double* blockA, double* blockB,
- double* blockTau,
- int b, int n, int ldm)
-{
- int i, j, k;
-
- double tau, beta;
-
- int ind = __sync_fetch_and_add( &nr_timers , 1 );
- timers[ind].threadID = omp_get_thread_num();
- timers[ind].type = 3;
- timers[ind].tic = getticks();
+/**
+ *
+ * @brief Applies the householder factorisation of the corner to the row tile.
+ *
+ * @param cornerTile A pointer to the tile for which the householder is stored.
+ * @param rowTiles The tile to which the householder is applied.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param jj The value of j for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix
+* (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+#pragma omp task in(cornerTile[0]) inout(rowTile[0]) in(tauMatrix[0])
+void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize,
+ int jj, int kk, double* restrict tauMatrix, int tauNum) {
+ int i, j, n;
+ double z = 0.0;
+ double w[tileSize];
+
+ /* Timer stuff. */
+ int ind = __sync_fetch_and_add( &nr_timers , 1 );
+ timers[ind].threadID = omp_get_thread_num();
+ timers[ind].type = 1;
+ timers[ind].tic = getticks();
- /* Compute b_j = b_j - tau*v*v'*b_j for each column j of blocks A & B,
- and for each householder vector v of blockV */
-
- /* For each column of B */
- for(j = 0; j < n; j ++)
- {
- /* For each householder vector. */
- for(k = 0; k < n; k ++)
- {
- /* tau = 2/v'v, computed earlier, stored in T(k,k). */
- tau = blockTau[k];
-
- /* Compute beta = v_k'b_j. */
- /* v_k is >0 (=1) only at position k in top half. */
- beta = blockA[(j*ldm) + k];
-
- /* For lower portion of v_k, aligning with the lower block */
- for(i = 0; i < b; i ++)
- beta += blockB[(j*ldm) + i] * blockV[(k*ldm) + i];
-
- beta *= tau;
-
- /* Compute b_j = b_j - beta * v */
- /* v_k = 1 at (k) in top half again */
- blockA[(j*ldm) + k] -= beta;
-
- /* Apply to bottom block. */
- for(i = 0; i < b; i ++)
- blockB[(j*ldm) + i] -= beta * blockV[(k*ldm) + i];
- }
- }
- timers[ind].toc = getticks();
+ /* For each row in the corner Tile*/
+ for (i = 0; i < tileSize; i++) {
+ /*Get w for row i */
+ for (j = i; j < tileSize; j++) {
+ w[j] = cornerTile[i * tileSize + j];
+ }
+ w[i] = 1.0;
+
+ /* Apply to the row Tile */
+ for (j = 0; j < tileSize; j++) {
+ z = 0.0;
+ /* Below Diagonal!*/
+ /*Compute w'*A_j*/
+ for (n = i; n < tileSize; n++) {
+ z = z + w[n] * rowTile[j * tileSize + n];
+ }
+ for (n = i; n < tileSize; n++) {
+ rowTile[j * tileSize + n] =
+ rowTile[j * tileSize + n] -
+ tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z;
+ }
+ }
+ }
+ timers[ind].toc = getticks();
}
-
/**
- * @breif Wrapper to get the dependencies right.
+ *
+ * @brief Applies the householder factorisation of the corner to the row tile.
+ *
+ * @param cornerTile The corner tile for this value of k.
+ * @param columnTile The tile for which householders are computed.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param ii The value of i for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix
+* (this is equal to the number of tiles on each column).
+ *
+ *
*/
-
-// #pragma omp task inout( a[0] ) inout( tau[0] )
-void DGEQRF ( int matrix_order, lapack_int m, lapack_int n,
- double* a, lapack_int lda, double* tau ) {
- int ind = __sync_fetch_and_add( &nr_timers , 1 );
- timers[ind].threadID = omp_get_thread_num();
- timers[ind].type = 0;
- timers[ind].tic = getticks();
- LAPACKE_dgeqrf( matrix_order, m, n, a, lda, tau );
- timers[ind].toc = getticks();
+#pragma omp task inout(cornerTile[0]) inout(columnTile[0]) out(tauMatrix[0])
+void DTSQRF(double* restrict cornerTile, double* restrict columnTile,
+ int tilesize, int ii, int kk, double* restrict tauMatrix,
+ int tauNum) {
+ int i, j, n;
+ double norm = 0.0, sign, u1, tau, z;
+ double w[2*tilesize];
+
+ /* Timer stuff. */
+ int ind = __sync_fetch_and_add( &nr_timers , 1 );
+ timers[ind].threadID = omp_get_thread_num();
+ timers[ind].type = 2;
+ timers[ind].tic = getticks();
+
+ /* For each column compute the householder vector. */
+ for (i = 0; i < tilesize; i++) {
+ norm = 0.0;
+ w[i] = cornerTile[i * tilesize + i];
+ norm = norm + w[i] * w[i];
+ for (j = i + 1; j < tilesize; j++) {
+ w[j] = 0.0;
+ }
+ for (j = 0; j < tilesize; j++) {
+ w[tilesize + j] = columnTile[i * tilesize + j];
+ norm = norm + w[tilesize + j] * w[tilesize + j];
}
+ norm = sqrt(norm);
+ if (w[i] >= 0.0)
+ sign = -1;
+ else
+ sign = 1;
+
+ u1 = w[i] - sign * norm;
+ if (u1 != 0.0) {
+ for (j = i + 1; j < 2 * tilesize; j++) {
+ w[j] = w[j] / u1;
+ }
+ } else {
+ for (j = i + 1; j < 2 * tilesize; j++) w[j] = 0.0;
+ }
+
+ if (norm != 0)
+ tau = -sign * u1 / norm;
+ else
+ tau = 0.0;
+
+ /* Apply to each row to the right.*/
+ for (j = i; j < tilesize; j++) {
+ /* Find w'*A_j, w is 0s except for first value with upper tile.*/
+ z = 1.0 * cornerTile[j * tilesize + i];
+ for (n = 0; n < tilesize; n++) {
+ z = z + w[tilesize + n] * columnTile[j * tilesize + n];
+ }
+ /* Apply to upper tile.*/
+ cornerTile[j * tilesize + i] =
+ cornerTile[j * tilesize + i] - tau * 1.0 * z;
+ for (n = i + 1; n < tilesize; n++) {
+ cornerTile[j * tilesize + n] =
+ cornerTile[j * tilesize + n] - tau * w[n] * z;
+ }
+ /* Apply to lower tile.*/
+ for (n = 0; n < tilesize; n++) {
+ columnTile[j * tilesize + n] =
+ columnTile[j * tilesize + n] - tau * w[tilesize + n] * z;
+ }
+ }
+ /* Store w*/
+ for (j = 0; j < tilesize; j++) {
+ columnTile[i * tilesize + j] = w[tilesize + j];
+ }
+ tauMatrix[(kk * tilesize + i) * tauNum + ii] = tau;
+ }
+ timers[ind].toc = getticks();
+}
/**
- * @breif Wrapper to get the dependencies right.
+ *
+ * @brief Applies the householder factorisation of the corner to the row tile.
+ *
+ * @param cornerTile A pointer to the tile to have the householder applied.
+ * @param columnTile The tile in which the householders are stored.
+ * @param rowTile The upper tile to have the householders applied.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param ii The value of i for the QR decomposition.
+ * @param jj The value of j for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix
+* (this is equal to the number of tiles on each column).
+ *
+ *
*/
-
-// #pragma omp task inout( v[0] ) in( tau[0] ) inout( t[0] )
-void DLARFT ( int matrix_order, char direct, char storev,
- lapack_int n, lapack_int k, const double* v,
- lapack_int ldv, const double* tau, double* t,
- lapack_int ldt ) {
- int ind = __sync_fetch_and_add( &nr_timers , 1 );
- timers[ind].threadID = omp_get_thread_num();
- timers[ind].type = 1;
- timers[ind].tic = getticks();
- LAPACKE_dlarft_work( matrix_order, direct, storev, n, k, v, ldv, tau, t, ldt );
- timers[ind].toc = getticks();
+#pragma omp task inout(cornerTile[0]) in(columnTile[0]) inout(rowTile[0]) in(tauMatrix[0])
+void DSSRFT(double* restrict cornerTile, double* restrict columnTile,
+ double* restrict rowTile, int tilesize, int ii, int jj, int kk,
+ double* restrict tauMatrix, int tauNum) {
+ int i, j, n;
+ double z;
+ double w[2*tilesize];
+
+ /* Timer stuff. */
+ int ind = __sync_fetch_and_add( &nr_timers , 1 );
+ timers[ind].threadID = omp_get_thread_num();
+ timers[ind].type = 3;
+ timers[ind].tic = getticks();
+
+ for (i = 0; i < tilesize; i++) {
+ for (j = 0; j < i; j++) w[j] = 0.0;
+ w[i] = 1.0;
+ for (j = i + 1; j < tilesize; j++) w[j] = 0.0;
+ for (j = 0; j < tilesize; j++)
+ w[j + tilesize] = columnTile[i * tilesize + j];
+
+ /* Apply householder vector (w) to the tiles.*/
+ for (j = 0; j < tilesize; j++) {
+ z = 0.0;
+ /* Compute w' * A_j */
+ for (n = 0; n < tilesize; n++) {
+ z += w[n] * rowTile[j * tilesize + n];
+ z += w[n + tilesize] * cornerTile[j * tilesize + n];
+ }
+ for (n = 0; n < tilesize; n++) {
+ rowTile[j * tilesize + n] =
+ rowTile[j * tilesize + n] -
+ tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[n] * z;
+ cornerTile[j * tilesize + n] =
+ cornerTile[j * tilesize + n] -
+ tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[tilesize + n] * z;
+ }
}
-
-
+ }
+ timers[ind].toc = getticks();
+}
+
/**
* @brief Computed a tiled QR factorization using QuickSched.
*
@@ -296,151 +411,140 @@ void DLARFT ( int matrix_order, char direct, char storev,
* @param n Number of tile columns.
* @param nr_threads Number of threads to use.
*/
-
-void test_qr ( int m , int n , int K , int nr_threads , int runs ) {
-
- int k, j, i, r;
- double *A, *A_orig, *tau;
- ticks tic, toc_run, tot_setup, tot_run = 0;
- int tid[ m*n ];
-
-
- /* Allocate and fill the original matrix. */
- if ( ( A = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL ||
- ( tau = (double *)malloc( sizeof(double) * m * n * K ) ) == NULL ||
- ( A_orig = (double *)malloc( sizeof(double) * m * n * K * K ) ) == NULL )
- error( "Failed to allocate matrices." );
- for ( k = 0 ; k < m * n * K * K ; k++ )
- A_orig[k] = 2*((double)rand()) / RAND_MAX - 1.0;
- memcpy( A , A_orig , sizeof(double) * m * n * K * K );
- bzero( tau , sizeof(double) * m * n * K );
-
- /* Dump A_orig. */
- /* message( "A_orig = [" );
- for ( k = 0 ; k < m*K ; k++ ) {
- for ( j = 0 ; j < n*K ; j++ )
- printf( "%.3f " , A_orig[ j*m*K + k ] );
- printf( "\n" );
- }
- printf( "];\n" ); */
-
- /* Loop over the number of runs. */
- for ( r = 0 ; r < runs ; r++ ) {
-
- /* Start the clock. */
- tic = getticks();
- nr_timers = 0;
-
- /* Launch the tasks. */
- for ( k = 0 ; k < m && k < n ; k++ ) {
-
- /* Add kth corner task. */
- #pragma omp task inout( tid[ k*m + k ] )
- DGEQRF( LAPACK_COL_MAJOR , K, K ,
- &A[ k*m*K*K + k*K ] , m*K , &tau[ k*m*K + k*K ] );
-
- /* Add column tasks on kth row. */
- for ( j = k+1 ; j < n ; j++ ) {
- #pragma omp task inout( tid[ j*m + k ] ) in( tid[ k*m + k ] )
- DLARFT( LAPACK_COL_MAJOR , 'F' , 'C' ,
- K , K , &A[ k*m*K*K + k*K ] ,
- m*K , &tau[ k*m*K + k*K ] , &A[ j*m*K*K + k*K ] ,
- m*K );
- }
-
- /* For each following row... */
- for ( i = k+1 ; i < m ; i++ ) {
-
- /* Add the row taks for the kth column. */
- #pragma omp task inout( tid[ k*m + i ] ) in( tid[ k*m + k ] )
- DTSQRF( &A[ k*m*K*K + k*K ] , &A[ k*m*K*K + i*K ] , &tau[ k*m*K + i*K ] , K , K , K , K*m );
-
- /* Add the inner tasks. */
- for ( j = k+1 ; j < n ; j++ ) {
- #pragma omp task inout( tid[ j*m + i ] ) in( tid[ k*m + i ] , tid[ j*m + k ] )
- DSSRFT( &A[ k*m*K + i*K ] , &A[ j*m*K*K + k*K ] , &A[ j*m*K*K + i*K ] , &tau[ k*m*K + i*K ] , K , K , K*m );
- }
-
- }
-
- } /* build the tasks. */
-
- /* Collect timers. */
- #pragma omp taskwait
- toc_run = getticks();
- message( "%ith run took %lli ticks..." , r , toc_run - tic );
- tot_run += toc_run - tic;
-
+
+void test_qr(int m, int n, int K, int nr_threads, int runs) {
+
+ int k, j, i, r;
+ double* A, *A_orig, *tau;
+ ticks tic, toc_run, tot_run = 0;
+
+ /* Allocate and fill the original matrix. */
+ if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
+ (tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL ||
+ (A_orig = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL)
+ error("Failed to allocate matrices.");
+ for (k = 0; k < m * n * K * K; k++)
+ A_orig[k] = 2 * ((double)rand()) / RAND_MAX - 1.0;
+ memcpy(A, A_orig, sizeof(double) * m * n * K * K);
+ bzero(tau, sizeof(double) * m * n * K);
+
+ /* Dump A_orig. */
+ /* message( "A_orig = [" );
+ for ( k = 0 ; k < m*K ; k++ ) {
+ for ( j = 0 ; j < n*K ; j++ )
+ printf( "%.3f " , A_orig[ j*m*K + k ] );
+ printf( "\n" );
+ }
+ printf( "];\n" ); */
+
+ /* Loop over the number of runs. */
+ for (r = 0; r < runs; r++) {
+
+ /* Start the clock. */
+ tic = getticks();
+ nr_timers = 0;
+
+ /* Launch the tasks. */
+ for (k = 0; k < m && k < n; k++) {
+
+ /* Add kth corner task. */
+ // #pragma omp task inout( tid[ k*m + k ] )
+ DGEQRF(&A[(k * m + k) * K * K], K, tau, k, m);
+
+ /* Add column tasks on kth row. */
+ for (j = k + 1; j < n; j++) {
+ // #pragma omp task inout( tid[ j*m + k ] ) in( tid[ k*m + k ] )
+ DLARFT(&A[(k * m + k) * K * K], &A[(j * m + k) * K * K], K, j, k, tau,
+ m);
+ }
+
+ /* For each following row... */
+ for (i = k + 1; i < m; i++) {
+
+ /* Add the row taks for the kth column. */
+ // #pragma omp task inout( tid[ k*m + i ] ) in( tid[ k*m + k ] )
+ DTSQRF(&A[(k * m + k) * K * K], &A[(k * m + i) * K * K], K, i, k, tau,
+ m);
+
+ /* Add the inner tasks. */
+ for (j = k + 1; j < n; j++) {
+ // #pragma omp task inout( tid[ j*m + i ] ) in( tid[ k*m + i ] , tid[
+ // j*m + k ] )
+ DSSRFT(&A[(j * m + i) * K * K], &A[(k * m + i) * K * K],
+ &A[(j * m + k) * K * K], K, i, j, k, tau, m);
}
-
- /* Dump the costs. */
- message( "costs: setup=%lli ticks, run=%lli ticks." ,
- tot_setup , tot_run/runs );
-
- /* Dump the tasks. */
- /* for ( k = 0 ; k < nr_timers ; k++ )
- printf( "%i %i %lli %lli\n" , timers[k].threadID , timers[k].type , timers[k].tic , timers[k].toc ); */
-
- }
+ }
+
+ } /* build the tasks. */
+
+/* Collect timers. */
+#pragma omp taskwait
+ toc_run = getticks();
+ message("%ith run took %lli ticks...", r, toc_run - tic);
+ tot_run += toc_run - tic;
+ }
+ /* Dump the costs. */
+ message("costs: run=%lli ticks.", tot_run / runs);
+
+ /* Dump the tasks. */
+ /* for ( k = 0 ; k < nr_timers ; k++ )
+ printf( "%i %i %lli %lli\n" , timers[k].threadID , timers[k].type ,
+ timers[k].tic , timers[k].toc ); */
+}
/**
* @brief Main function.
*/
-
-int main ( int argc , char *argv[] ) {
- int c, nr_threads;
- int M = 4, N = 4, runs = 1, K = 32;
-
- /* Get the number of threads. */
- #pragma omp parallel shared(nr_threads)
- {
- if ( omp_get_thread_num() == 0 )
- nr_threads = omp_get_num_threads();
+int main(int argc, char* argv[]) {
+
+ int c, nr_threads;
+ int M = 4, N = 4, runs = 1, K = 32;
+
+/* Get the number of threads. */
+#pragma omp parallel shared(nr_threads)
+ {
+ if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
+ }
+
+ /* Parse the options */
+ while ((c = getopt(argc, argv, "m:n:k:r:t:")) != -1) switch (c) {
+ case 'm':
+ if (sscanf(optarg, "%d", &M) != 1) error("Error parsing dimension M.");
+ break;
+ case 'n':
+ if (sscanf(optarg, "%d", &N) != 1) error("Error parsing dimension M.");
+ break;
+ case 'k':
+ if (sscanf(optarg, "%d", &K) != 1) error("Error parsing tile size.");
+ break;
+ case 'r':
+ if (sscanf(optarg, "%d", &runs) != 1)
+ error("Error parsing number of runs.");
+ break;
+ case 't':
+ if (sscanf(optarg, "%d", &nr_threads) != 1)
+ error("Error parsing number of threads.");
+ omp_set_num_threads(nr_threads);
+ break;
+ case '?':
+ fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
+ argv[0]);
+ fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n"
+ "matrix using nr_threads threads.\n");
+ exit(EXIT_FAILURE);
}
-
- /* Parse the options */
- while ( ( c = getopt( argc , argv , "m:n:k:r:t:" ) ) != -1 )
- switch( c ) {
- case 'm':
- if ( sscanf( optarg , "%d" , &M ) != 1 )
- error( "Error parsing dimension M." );
- break;
- case 'n':
- if ( sscanf( optarg , "%d" , &N ) != 1 )
- error( "Error parsing dimension M." );
- break;
- case 'k':
- if ( sscanf( optarg , "%d" , &K ) != 1 )
- error( "Error parsing tile size." );
- break;
- case 'r':
- if ( sscanf( optarg , "%d" , &runs ) != 1 )
- error( "Error parsing number of runs." );
- break;
- case 't':
- if ( sscanf( optarg , "%d" , &nr_threads ) != 1 )
- error( "Error parsing number of threads." );
- omp_set_num_threads( nr_threads );
- break;
- case '?':
- fprintf( stderr , "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n" , argv[0] );
- fprintf( stderr , "Computes the tiled QR decomposition of an MxN tiled\n"
- "matrix using nr_threads threads.\n" );
- exit( EXIT_FAILURE );
- }
-
- /* Dump arguments. */
- message( "Computing the tiled QR decomposition of a %ix%i matrix using %i threads (%i runs)." ,
- 32*M , 32*N , nr_threads , runs );
-
- /* Initialize the timers. */
- if ( ( timers = (struct timer *)malloc( sizeof(struct timer) * M*M*N ) ) == NULL )
- error( "Failed to allocate timers." );
-
- test_qr( M , N , K , nr_threads , runs );
-
- }
-
-
+
+ /* Dump arguments. */
+ message("Computing the tiled QR decomposition of a %ix%i matrix using %i "
+ "threads (%i runs).",
+ 32 * M, 32 * N, nr_threads, runs);
+
+ /* Initialize the timers. */
+ if ((timers = (struct timer*)malloc(sizeof(struct timer) * M * M * N)) ==
+ NULL)
+ error("Failed to allocate timers.");
+
+ test_qr(M, N, K, nr_threads, runs);
+}