-
d74ksy authored
Added some stuff for the qr and bh testcases as diagnostics. qr now has task timers output. Bug in mpirun on gtx690 means i can only run with 1 core inside the mpirun wrapper. There is probably a bug in the QR decomposition as it scales far too well...
d74ksy authoredAdded some stuff for the qr and bh testcases as diagnostics. qr now has task timers output. Bug in mpirun on gtx690 means i can only run with 1 core inside the mpirun wrapper. There is probably a bug in the QR decomposition as it scales far too well...
test_qr_mpi.c 26.45 KiB
/*******************************************************************************
* This file is part of QuickSched.
* Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* *****************************************************************************/
/* Config parameters. */
#include "../config.h"
/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <omp.h>
#include <fenv.h>
#include <mpi.h>
#ifdef WITH_CLBAS
#include <cblas.h>
#endif
/* Local includes. */
#include "quicksched.h"
#include "res.h"
#define TASK_TIMERS
#ifdef WITH_CBLAS
/**
* Takes a column major matrix, NOT tile major. size is length of a side of the
* matrix. Only works for square matrices.
* This function is simply for validation and is implemented naively as we know
* of no implementation to retrieve Q from the tiled QR.
*/
double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) {
double* Q = malloc(sizeof(double) * size * size);
double* Qtemp = malloc(sizeof(double) * size * size);
double* w = malloc(sizeof(double) * size);
double* ww = malloc(sizeof(double) * size * size);
double* temp = malloc(sizeof(double) * size * size);
int i, k, l, j, n;
bzero(Q, sizeof(double) * size * size);
bzero(Qtemp, sizeof(double) * size * size);
bzero(ww, sizeof(double) * size * size);
for (i = 0; i < size; i++) {
Q[i * size + i] = 1.0;
}
int numcoltile = size / tilesize;
int numrowtile = size / tilesize;
for (k = 0; k < numrowtile; k++) {
for (l = 0; l < tilesize; l++) {
bzero(Qtemp, sizeof(double) * size * size);
for (i = 0; i < size; i++) {
Qtemp[i * size + i] = 1.0;
}
for (i = k; i < numcoltile; i++) {
bzero(w, sizeof(double) * size);
for (j = 0; j < tilesize; j++) {
w[i * tilesize + j] =
HR[(k * tilesize + l) * size + i * tilesize + j];
}
w[k * tilesize + l] = 1.0;
if (k * tilesize + l > i * tilesize) {
for (j = 0; j < k * tilesize + l; j++) w[j] = 0.0;
}
/* Compute (I - tau*w*w')' */
for (j = 0; j < size; j++) {
for (n = 0; n < size; n++) {
if (j != n)
ww[n * size + j] =
-tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
else
ww[n * size + j] =
1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
}
}
/* Qtemp = Qtemp * (I-tau*w*w')' */
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
1.0, Qtemp, size, ww, size, 0.0, temp, size);
double* b = Qtemp;
Qtemp = temp;
temp = b;
}
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
1.0, Q, size, Qtemp, size, 0.0, temp, size);
double* b = Q;
Q = temp;
temp = b;
}
}
free(Qtemp);
free(w);
free(ww);
free(temp);
return Q;
}
#endif
double* getR(double* HR, int size) {
double* R = malloc(sizeof(double) * size * size);
int i, j;
bzero(R, sizeof(double) * size * size);
for (i = 0; i < size; i++) {
for (j = 0; j <= i; j++) {
R[i * size + j] = HR[i * size + j];
}
}
return R;
}
void printMatrix(double* Matrix, int m, int n, int tilesize) {
int i, j;
for (i = 0; i < m * tilesize; i++) {
for (j = 0; j < n * tilesize; j++) {
printf(" %.3f ", Matrix[j * m * tilesize + i]);
}
printf("\n");
}
}
double* columnToTile(double* columnMatrix, int size, int m, int n,
int tilesize) {
double* TileMatrix;
TileMatrix = malloc(sizeof(double) * size);
if (TileMatrix == NULL) error("failed to allocate TileMatrix");
int i, j, k, l;
for (i = 0; i < n; i++) {
for (j = 0; j < m; j++) {
double* tileStart =
&columnMatrix[i * m * tilesize * tilesize + j * tilesize];
double* tilePos =
&TileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
for (k = 0; k < tilesize; k++) {
tileStart = &columnMatrix[i * m * tilesize * tilesize +
k * m * tilesize + j * tilesize];
for (l = 0; l < tilesize; l++) {
tilePos[k * tilesize + l] = tileStart[l];
}
}
}
}
return TileMatrix;
}
double* tileToColumn(double* tileMatrix, int size, int m, int n, int tilesize) {
double* ColumnMatrix;
ColumnMatrix = (double*)malloc(sizeof(double) * size);
if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix");
int i, j, k, l;
for (i = 0; i < n; i++) {
for (j = 0; j < m; j++) {
/* Tile on ith column is at i*m*32*32.*/
/* Tile on jth is at j*32*32 */
double* tile =
&tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
/* Column starts at same position as tile. */
/* Row j*32.*/
double* tilePos =
&ColumnMatrix[i * m * tilesize * tilesize + j * tilesize];
for (k = 0; k < tilesize; k++) {
for (l = 0; l < tilesize; l++) {
tilePos[l] = tile[l];
}
/* Next 32 elements are the position of the tile in the next column.*/
tile = &tile[tilesize];
/* Move to the j*32th position in the next column. */
tilePos = &tilePos[tilesize * m];
}
}
}
return ColumnMatrix;
}
/* Routines for the tiled QR decomposition.*/
/**
*
* @brief Computes the QR decomposition of a tile.
*
* @param cornerTile A pointer to the tile for which the decomposition is
* computed.
* @param tilesize The number of elements on a row/column of the tile.
* @param tauMatrix A pointer to the tau Matrix.
* @param k The value of k for the QR decomposition
* @param tauNum The number of tau values stored for each row of the matrix
* (this is equal to the number of tiles on each column).
*
*
*/
void DGEQRF(double* restrict cornerTile, int tileSize,
double* restrict tauMatrix, int k, int tauNum) {
int i, j, n;
double norm = 0.0, sign, u1, tau, z;
double w[tileSize];
/*Find the householder vector for each row. */
for (i = 0; i < tileSize; i++) {
norm = 0.0;
/*Fill w with the vector.*/
for (j = i; j < tileSize; j++) {
/* ith row is i*tileSize, only want elements on diagonal or below.*/
w[j] = cornerTile[i * tileSize + j];
/*Find the norm as well */
norm = norm + w[j] * w[j];
}
if (w[i] >= 0.0)
sign = -1;
else
sign = 1;
norm = sqrt(norm);
u1 = w[i] - sign * norm;
if (u1 != 0.0) {
for (j = i + 1; j < tileSize; j++) w[j] = w[j] / u1;
} else {
for (j = i + 1; j < tileSize; j++) w[j] = 0.0;
}
if (norm != 0.0)
tau = -sign * u1 / norm;
else
tau = 0.0;
/*Store the below diagonal vector */
for (j = i + 1; j < tileSize; j++) {
cornerTile[i * tileSize + j] = w[j];
}
cornerTile[i * tileSize + i] = sign * norm;
w[i] = 1.0;
/* Apply the householder transformation to the rest of the tile, for
* everything to the right of the diagonal. */
for (j = i + 1; j < tileSize; j++) {
/*Compute w'*A_j*/
z = cornerTile[j * tileSize + i];
for (n = i + 1; n < tileSize; n++) {
z = z + cornerTile[j * tileSize + n] * w[n];
}
/* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
for (n = i; n < tileSize; n++) {
cornerTile[j * tileSize + n] =
cornerTile[j * tileSize + n] - tau * w[n] * z;
}
}
/* Store tau. We're on k*tileSize+ith row. kth column.*/
// tauMatrix[(k * tileSize + i) * tauNum + k] = tau;
tauMatrix[i] = tau;
}
}
/**
*
* @brief Applies the householder factorisation of the corner to the row tile.
*
* @param cornerTile A pointer to the tile for which the householder is stored.
* @param rowTiles The tile to which the householder is applied.
* @param tilesize The number of elements on a row/column of the tile.
* @param tauMatrix A pointer to the tau Matrix.
* @param jj The value of j for the QR decomposition.
* @param kk The value of k for the QR decomposition.
* @param tauNum The number of tau values stored for each row of the matrix
* (this is equal to the number of tiles on each column).
*
*
*/
void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize,
int jj, int kk, double* restrict tauMatrix, int tauNum) {
int i, j, n;
double z = 0.0;
double w[tileSize];
/* For each row in the corner Tile*/
for (i = 0; i < tileSize; i++) {
/*Get w for row i */
for (j = i; j < tileSize; j++) {
w[j] = cornerTile[i * tileSize + j];
}
w[i] = 1.0;
/* Apply to the row Tile */
for (j = 0; j < tileSize; j++) {
z = 0.0;
/* Below Diagonal!*/
/*Compute w'*A_j*/
for (n = i; n < tileSize; n++) {
z = z + w[n] * rowTile[j * tileSize + n];
}
for (n = i; n < tileSize; n++) {
rowTile[j * tileSize + n] =
rowTile[j * tileSize + n] -
//tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z;
tauMatrix[i] * w[n] * z;
}
}
}
}
/**
*
* @brief Applies the householder factorisation of the corner to the row tile.
*
* @param cornerTile The corner tile for this value of k.
* @param columnTile The tile for which householders are computed.
* @param tilesize The number of elements on a row/column of the tile.
* @param tauMatrix A pointer to the tau Matrix.
* @param ii The value of i for the QR decomposition.
* @param kk The value of k for the QR decomposition.
* @param tauNum The number of tau values stored for each row of the matrix
* (this is equal to the number of tiles on each column).
*
*
*/
void DTSQRF(double* restrict cornerTile, double* restrict columnTile,
int tilesize, int ii, int kk, double* restrict tauMatrix,
int tauNum) {
int i, j, n;
double norm = 0.0, sign, u1, tau, z;
double w[2*tilesize];
/* For each column compute the householder vector. */
for (i = 0; i < tilesize; i++) {
norm = 0.0;
w[i] = cornerTile[i * tilesize + i];
norm = norm + w[i] * w[i];
for (j = i + 1; j < tilesize; j++) {
w[j] = 0.0;
}
for (j = 0; j < tilesize; j++) {
w[tilesize + j] = columnTile[i * tilesize + j];
norm = norm + w[tilesize + j] * w[tilesize + j];
}
norm = sqrt(norm);
if (w[i] >= 0.0)
sign = -1;
else
sign = 1;
u1 = w[i] - sign * norm;
if (u1 != 0.0) {
for (j = i + 1; j < 2 * tilesize; j++) {
w[j] = w[j] / u1;
}
} else {
for (j = i + 1; j < 2 * tilesize; j++) w[j] = 0.0;
}
if (norm != 0)
tau = -sign * u1 / norm;
else
tau = 0.0;
/* Apply to each row to the right.*/
for (j = i; j < tilesize; j++) {
/* Find w'*A_j, w is 0s except for first value with upper tile.*/
z = 1.0 * cornerTile[j * tilesize + i];
for (n = 0; n < tilesize; n++) {
z = z + w[tilesize + n] * columnTile[j * tilesize + n];
}
/* Apply to upper tile.*/
cornerTile[j * tilesize + i] =
cornerTile[j * tilesize + i] - tau * 1.0 * z;
for (n = i + 1; n < tilesize; n++) {
cornerTile[j * tilesize + n] =
cornerTile[j * tilesize + n] - tau * w[n] * z;
}
/* Apply to lower tile.*/
for (n = 0; n < tilesize; n++) {
columnTile[j * tilesize + n] =
columnTile[j * tilesize + n] - tau * w[tilesize + n] * z;
}
}
/* Store w*/
for (j = 0; j < tilesize; j++) {
columnTile[i * tilesize + j] = w[tilesize + j];
}
// tauMatrix[(kk * tilesize + i) * tauNum + ii] = tau;
tauMatrix[i] = tau;
}
}
/**
*
* @brief Applies the householder factorisation of the corner to the row tile.
*
* @param cornerTile A pointer to the tile to have the householder applied.
* @param columnTile The tile in which the householders are stored.
* @param rowTile The upper tile to have the householders applied.
* @param tilesize The number of elements on a row/column of the tile.
* @param tauMatrix A pointer to the tau Matrix.
* @param ii The value of i for the QR decomposition.
* @param jj The value of j for the QR decomposition.
* @param kk The value of k for the QR decomposition.
* @param tauNum The number of tau values stored for each row of the matrix
* (this is equal to the number of tiles on each column).
*
*
*/
void DSSRFT(double* restrict cornerTile, double* restrict columnTile,
double* restrict rowTile, int tilesize, int ii, int jj, int kk,
double* restrict tauMatrix, int tauNum) {
int i, j, n;
double z;
double w[2*tilesize];
for (i = 0; i < tilesize; i++) {
for (j = 0; j < i; j++) w[j] = 0.0;
w[i] = 1.0;
for (j = i + 1; j < tilesize; j++) w[j] = 0.0;
for (j = 0; j < tilesize; j++)
w[j + tilesize] = columnTile[i * tilesize + j];
/* Apply householder vector (w) to the tiles.*/
for (j = 0; j < tilesize; j++) {
z = 0.0;
/* Compute w' * A_j */
for (n = 0; n < tilesize; n++) {
z += w[n] * rowTile[j * tilesize + n];
z += w[n + tilesize] * cornerTile[j * tilesize + n];
}
for (n = 0; n < tilesize; n++) {
rowTile[j * tilesize + n] =
rowTile[j * tilesize + n] -
//tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[n] * z;
tauMatrix[i] * w[n] * z;
cornerTile[j * tilesize + n] =
cornerTile[j * tilesize + n] -
// tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[tilesize + n] * z;
tauMatrix[i] * w[tilesize+ n] * z;
}
}
}
}
void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
int k, j, i;
double* A, *A_orig= NULL, *tau;
struct qsched s;
qsched_task_t* tid = NULL, tid_new = -1;
qsched_res_t *rid = NULL, *tau_id = NULL;
double **rids = NULL, **taus = NULL;
int data[3];
#ifdef TASK_TIMERS
long long int MPI_data[7];
#else
long long int MPI_data[6];
#endif
// ticks tic, toc_run, tot_setup, tot_run = 0;
#ifdef TASK_TIMERS
ticks tic, toc_run;
#endif
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
#ifdef TASK_TIMERS
long long int *task_start;
long long int *task_finish;
int *task_tids;
int *task_types;
#endif
// Initialize the MPI environment
int MpiThreadLevel;
MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &MpiThreadLevel);
if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
error("We didn't get thread multiple!");
MPI_Get_processor_name(processor_name, &name_len);
MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN );
enum task_types {
task_DGEQRF=1,
task_DLARFT,
task_DTSQRF,
task_DSSRFT
};
printf("Task_DGEQRF = %i\n", task_DGEQRF);
/* Runner function to pass to the scheduler. */
void runner(struct qsched *s, int type, void * data) {
#ifdef TASK_TIMERS
tic = getticks();
#endif
/* Decode the task data. */
long long int* idata = (long long int*)data;
// int i = idata[0], j = idata[1], k = idata[2];
long long int i, j , a , t;
long long int tid = -1;
/* Need to pull the resources.*/
/* Decode and execute the task. */
switch (type) {
case task_DGEQRF:
i = idata[0];
j = idata[1];
double *cornerTile = (double*)qsched_getresdata(s, i);
double *tau = (double*)qsched_getresdata(s, j);
DGEQRF(cornerTile, K, tau, k, m);
#ifdef TASK_TIMERS
tid = (idata[2] * m * m);
#endif
break;
case task_DLARFT:
t = idata[2];
i = idata[0];
j = idata[1];
double *lockedTile = (double*)qsched_getresdata(s, i);
cornerTile = (double*)qsched_getresdata(s, j);
tau = (double*)qsched_getresdata(s, t);
DLARFT(cornerTile, lockedTile, K, j, k, tau,
m);
#ifdef TASK_TIMERS
tid = (idata[3] * m * m) + (idata[4] * m);
#endif
break;
case task_DTSQRF:
i = idata[0];
j = idata[1];
t = idata[2];
lockedTile = (double*)qsched_getresdata(s, i);
cornerTile = (double*)qsched_getresdata(s, j);
tau = (double*)qsched_getresdata(s, t);
DTSQRF(cornerTile, lockedTile, K, i, k, tau,
m);
#ifdef TASK_TIMERS
tid = (idata[3] * m * m) + (idata[3] * m) + idata[4];
#endif
break;
case task_DSSRFT:
a = idata[2];
i = idata[0];
j = idata[1];
t = idata[3];
double *lockedTile1 = (double*)qsched_getresdata(s, i);
double *usedTile = (double*)qsched_getresdata(s, j);
double *lockedTile2 = (double*)qsched_getresdata(s, a);
tau = (double*)qsched_getresdata(s, t);
DSSRFT(lockedTile1, usedTile,
lockedTile2, K, i, j, k, tau, m);
#ifdef TASK_TIMERS
tid = (idata[4] * m * m) + (idata[6] * m) + (idata[5]);
#endif
break;
// default:
// error("Unknown task type.");
}
#ifdef TASK_TIMERS
if(type > 0){
toc_run = getticks();
task_start[tid] = tic;
task_finish[tid] = toc_run;
task_tids[tid] = omp_get_thread_num();
task_types[tid] = type;
}
#endif
}
qsched_init(&s, 2, qsched_flag_none, MPI_COMM_WORLD);
#ifdef TASK_TIMERS
task_start = (long long int*)calloc(sizeof(long long int), m*n*K*K);
printf("Created task_start of size %i", m*n*K*K);
task_finish = (long long int*)calloc(sizeof(long long int), m*n*K*K);
task_tids = (int*)calloc(sizeof(int), m*n*K*K);
task_types = (int*)calloc(sizeof(int), m*n*K*K);
#endif
if(s.rank == 0){
/* Allocate and fill the original matrix. */
if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
(tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL ||
(A_orig = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL)
error("Failed to allocate matrices.");
for (k = 0; k < m * n * K * K; k++) {
if (matrix == NULL)
A_orig[k] = 2 * ((double)rand()) / RAND_MAX - 1.0;
else
A_orig[k] = matrix[k];
}
memcpy(A, A_orig, sizeof(double) * m * n * K * K);
bzero(tau, sizeof(double) * m * n * K);
/* Initialize the scheduler. */
/* Allocate and init the task ID and resource ID matrix. */
// tic = getticks();
if ((tid = (qsched_task_t*)malloc(sizeof(qsched_task_t)* m* n)) == NULL ||
(rid = (qsched_res_t*)malloc(sizeof(qsched_res_t) * m * n)) == NULL)
error("Failed to allocate tid/rid matrix.");
if( (rids = (double**)malloc(sizeof(double*)*m*n)) == NULL)
error("Failed to allocate rids.");
if( (tau_id = (qsched_res_t*)malloc(sizeof(qsched_res_t) * m*n) ) == NULL)
error("Failed to allocate tau_id matrix.");
if( (taus = (double**)malloc(sizeof(double*)*m*n)) == NULL)
error("Failed to allocate taus.");
for (k = 0; k < m * n; k++) {
//tid[k] = qsched_task_none;
rid[k] = qsched_addres(&s, qsched_owner_none, sizeof(double) * K * K, (void**) &rids[k]);
tau_id[k] = qsched_addres(&s, qsched_owner_none, sizeof(double) * K, (void**) &taus[k]);
memcpy(taus[k], tau, sizeof(double) * K);
}
}
message("Synchronizing resources.");
qsched_sync_resources(&s);
if(s.rank == 0) {
for (k = 0; k < m * n; k++) {
tid[k] = qsched_task_none;
}
/* Build the tasks. */
for (k = 0; k < m && k < n; k++) {
/* Add kth corner task. */
data[0] = k;
data[1] = k;
data[2] = k;
MPI_data[0] = rid[k*m+k];
MPI_data[1] = tau_id[k*m+k];
#ifdef TASK_TIMERS
MPI_data[2] = k;
tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, MPI_data,
sizeof(long long int) * 3, 300);
#else
tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, MPI_data,
sizeof(long long int) * 2, 200);
#endif
qsched_addlock(&s, tid_new, rid[k * m + k]);
qsched_addlock(&s, tid_new, tau_id[k*m+k]);
if(k == 0)
{
memcpy(rids[k*m+k], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
}
if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new);
tid[k * m + k] = tid_new;
/* Add column tasks on kth row. */
for (j = k + 1; j < n; j++) {
data[0] = k;
data[1] = j;
data[2] = k;
MPI_data[0] = rid[j*m+k];
MPI_data[1] = rid[k*m+k];
MPI_data[2] = tau_id[k*m+k];
#ifdef TASK_TIMERS
MPI_data[3] = k;
MPI_data[4] = j;
tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, MPI_data,
sizeof(long long int) * 5, 300);
#else
tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, MPI_data,
sizeof(long long int) * 3, 300);
#endif
if(k == 0)
{
memcpy(rids[j*m+k], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
}
qsched_addlock(&s, tid_new, rid[j * m + k]);
qsched_adduse(&s, tid_new, rid[k * m + k]);
qsched_adduse(&s, tid_new, tau_id[k*m+k]);
qsched_addunlock(&s, tid[k * m + k], tid_new);
if (tid[j * m + k] != -1) qsched_addunlock(&s, tid[j * m + k], tid_new);
tid[j * m + k] = tid_new;
}
/* For each following row... */
for (i = k + 1; i < m; i++) {
/* Add the row taks for the kth column. */
data[0] = i;
data[1] = k;
data[2] = k;
MPI_data[0] = rid[k*m+i];
MPI_data[1] = rid[k*m+k];
MPI_data[2] = tau_id[k*m+i];
#ifdef TASK_TIMERS
MPI_data[3] = k;
MPI_data[4] = i;
tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, MPI_data,
sizeof(long long int) * 5, 300);
#else
tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, MPI_data,
sizeof(long long int) * 3, 300);
#endif
if(k == 0)
{
memcpy(rids[k*m+i], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
}
qsched_addlock(&s, tid_new, rid[k * m + i]);
qsched_adduse(&s, tid_new, rid[k * m + k]);
qsched_addlock(&s, tid_new, tau_id[k*m+i]);
qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
tid[k * m + i] = tid_new;
/* Add the inner tasks. */
for (j = k + 1; j < n; j++) {
data[0] = i;
data[1] = j;
data[2] = k;
MPI_data[0] = rid[j*m+i];
MPI_data[1] = rid[k*m+i];
MPI_data[2] = rid[j*m+k];
MPI_data[3] = tau_id[k*m+i];
#ifdef TASK_TIMERS
MPI_data[4] = k;
MPI_data[5] = i;
MPI_data[6] = j;
tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, MPI_data,
sizeof(long long int) * 7, 300);
#else
tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, MPI_data,
sizeof(long long int) * 4, 500);
#endif
if(k == 0)
{
memcpy(rids[j*m+i], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
}
qsched_addlock(&s, tid_new, rid[j * m + i]);
qsched_adduse(&s, tid_new, rid[k * m + i]);
qsched_addlock(&s, tid_new, rid[j * m + k]);
qsched_adduse(&s, tid_new, tau_id[k*m+i]);
// qsched_addunlock(&s, tid[k * m + i], tid_new);
qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
tid[j * m + i] = tid_new;
}
}
} /* build the tasks. */
}
qsched_run_MPI(&s, nr_threads, runner);
// Print off a hello world message
printf("Hello world from processor %s, rank = %i, count_ranks = %i\n",
processor_name, s.rank, s.count_ranks);
//TODO Clean up the resource data.
qsched_free(&s);
#ifdef TASK_TIMERS
FILE *file = NULL;
if(s.rank == 0)
file = fopen("Task_timers0.out", "w");
else if (s.rank == 1)
file = fopen("Task_timers1.out", "w");
else if (s.rank == 2)
file = fopen("Task_timers2.out", "w");
else if (s.rank == 3)
file = fopen("Task_timers3.out", "w");
for(i = 0; i < m*n*K*K; i++)
{
if(task_types[i] > 0)
fprintf(file, "%i %i %lli %lli\n", task_types[i], task_tids[i],task_start[i], task_finish[i] );
}
fclose(file);
free(task_types);
free(task_tids);
free(task_start);
free(task_finish);
#endif
// Finalize the MPI environment.
MPI_Finalize();
}
/**
* @brief Main function.
*/
int main(int argc, char* argv[]) {
int c, nr_threads=1;
int M = 4, N = 4, runs = 1, K = 32;
/* Get the number of threads. */
//#pragma omp parallel shared(nr_threads)
//{
// if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
//}
/* Parse the options */
while ((c = getopt(argc, argv, "m:n:k:r:t:")) != -1) switch (c) {
case 'm':
if (sscanf(optarg, "%d", &M) != 1) error("Error parsing dimension M.");
break;
case 'n':
if (sscanf(optarg, "%d", &N) != 1) error("Error parsing dimension M.");
break;
case 'k':
if (sscanf(optarg, "%d", &K) != 1) error("Error parsing tile size.");
break;
case 'r':
if (sscanf(optarg, "%d", &runs) != 1)
error("Error parsing number of runs.");
break;
case 't':
if (sscanf(optarg, "%d", &nr_threads) != 1)
error("Error parsing number of threads.");
omp_set_num_threads(nr_threads);
message("omp_get_max_threads() = %i\n", omp_get_max_threads());
message("omp_get_num_procs() = %i\n", omp_get_num_procs());
break;
case '?':
fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
argv[0]);
fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n"
"matrix using nr_threads threads.\n");
exit(EXIT_FAILURE);
}
/* Dump arguments. */
message("Computing the tiled QR decomposition of a %ix%i matrix using %i "
"threads (%i runs).",
32 * M, 32 * N, nr_threads, runs);
test_qr(M, N, K, nr_threads, runs, NULL);
}