Select Git revision
test_qr_mpi.c
test_qr_mpi.c 26.45 KiB
/*******************************************************************************
* This file is part of QuickSched.
* Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* *****************************************************************************/
/* Config parameters. */
#include "../config.h"
/* Standard includes. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <float.h>
#include <limits.h>
#include <omp.h>
#include <fenv.h>
#include <mpi.h>
#ifdef WITH_CLBAS
#include <cblas.h>
#endif
/* Local includes. */
#include "quicksched.h"
#include "res.h"
#define TASK_TIMERS
#ifdef WITH_CBLAS
/**
* Takes a column major matrix, NOT tile major. size is length of a side of the
* matrix. Only works for square matrices.
* This function is simply for validation and is implemented naively as we know
* of no implementation to retrieve Q from the tiled QR.
*/
double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) {
double* Q = malloc(sizeof(double) * size * size);
double* Qtemp = malloc(sizeof(double) * size * size);
double* w = malloc(sizeof(double) * size);
double* ww = malloc(sizeof(double) * size * size);
double* temp = malloc(sizeof(double) * size * size);
int i, k, l, j, n;
bzero(Q, sizeof(double) * size * size);
bzero(Qtemp, sizeof(double) * size * size);
bzero(ww, sizeof(double) * size * size);
for (i = 0; i < size; i++) {
Q[i * size + i] = 1.0;
}
int numcoltile = size / tilesize;
int numrowtile = size / tilesize;
for (k = 0; k < numrowtile; k++) {
for (l = 0; l < tilesize; l++) {
bzero(Qtemp, sizeof(double) * size * size);
for (i = 0; i < size; i++) { Qtemp[i * size + i] = 1.0;
}
for (i = k; i < numcoltile; i++) {
bzero(w, sizeof(double) * size);
for (j = 0; j < tilesize; j++) {
w[i * tilesize + j] =
HR[(k * tilesize + l) * size + i * tilesize + j];
}
w[k * tilesize + l] = 1.0;
if (k * tilesize + l > i * tilesize) {
for (j = 0; j < k * tilesize + l; j++) w[j] = 0.0;
}
/* Compute (I - tau*w*w')' */
for (j = 0; j < size; j++) {
for (n = 0; n < size; n++) {
if (j != n)
ww[n * size + j] =
-tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
else
ww[n * size + j] =
1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
}
}
/* Qtemp = Qtemp * (I-tau*w*w')' */
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
1.0, Qtemp, size, ww, size, 0.0, temp, size);
double* b = Qtemp;
Qtemp = temp;
temp = b;
}
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
1.0, Q, size, Qtemp, size, 0.0, temp, size);
double* b = Q;
Q = temp;
temp = b;
}
}
free(Qtemp);
free(w);
free(ww);
free(temp);
return Q;
}
#endif
double* getR(double* HR, int size) {
double* R = malloc(sizeof(double) * size * size);
int i, j;
bzero(R, sizeof(double) * size * size);
for (i = 0; i < size; i++) {
for (j = 0; j <= i; j++) {
R[i * size + j] = HR[i * size + j];
}
}
return R;
}
void printMatrix(double* Matrix, int m, int n, int tilesize) {
int i, j;
for (i = 0; i < m * tilesize; i++) {
for (j = 0; j < n * tilesize; j++) {
printf(" %.3f ", Matrix[j * m * tilesize + i]);
} printf("\n");
}
}
double* columnToTile(double* columnMatrix, int size, int m, int n,
int tilesize) {
double* TileMatrix;
TileMatrix = malloc(sizeof(double) * size);
if (TileMatrix == NULL) error("failed to allocate TileMatrix");
int i, j, k, l;
for (i = 0; i < n; i++) {
for (j = 0; j < m; j++) {
double* tileStart =
&columnMatrix[i * m * tilesize * tilesize + j * tilesize];
double* tilePos =
&TileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
for (k = 0; k < tilesize; k++) {
tileStart = &columnMatrix[i * m * tilesize * tilesize +
k * m * tilesize + j * tilesize];
for (l = 0; l < tilesize; l++) {
tilePos[k * tilesize + l] = tileStart[l];
}
}
}
}
return TileMatrix;
}
double* tileToColumn(double* tileMatrix, int size, int m, int n, int tilesize) {
double* ColumnMatrix;
ColumnMatrix = (double*)malloc(sizeof(double) * size);
if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix");
int i, j, k, l;
for (i = 0; i < n; i++) {
for (j = 0; j < m; j++) {
/* Tile on ith column is at i*m*32*32.*/
/* Tile on jth is at j*32*32 */
double* tile =
&tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
/* Column starts at same position as tile. */
/* Row j*32.*/
double* tilePos =
&ColumnMatrix[i * m * tilesize * tilesize + j * tilesize];
for (k = 0; k < tilesize; k++) {
for (l = 0; l < tilesize; l++) {
tilePos[l] = tile[l];
}
/* Next 32 elements are the position of the tile in the next column.*/
tile = &tile[tilesize];
/* Move to the j*32th position in the next column. */
tilePos = &tilePos[tilesize * m];
}
}
}
return ColumnMatrix;
}
/* Routines for the tiled QR decomposition.*/
/**
*
* @brief Computes the QR decomposition of a tile.
*
* @param cornerTile A pointer to the tile for which the decomposition is
* computed.
* @param tilesize The number of elements on a row/column of the tile.
* @param tauMatrix A pointer to the tau Matrix.
* @param k The value of k for the QR decomposition * @param tauNum The number of tau values stored for each row of the matrix
* (this is equal to the number of tiles on each column).
*
*
*/
void DGEQRF(double* restrict cornerTile, int tileSize,
double* restrict tauMatrix, int k, int tauNum) {
int i, j, n;
double norm = 0.0, sign, u1, tau, z;
double w[tileSize];
/*Find the householder vector for each row. */
for (i = 0; i < tileSize; i++) {
norm = 0.0;
/*Fill w with the vector.*/
for (j = i; j < tileSize; j++) {
/* ith row is i*tileSize, only want elements on diagonal or below.*/
w[j] = cornerTile[i * tileSize + j];
/*Find the norm as well */
norm = norm + w[j] * w[j];
}
if (w[i] >= 0.0)
sign = -1;
else
sign = 1;
norm = sqrt(norm);
u1 = w[i] - sign * norm;
if (u1 != 0.0) {
for (j = i + 1; j < tileSize; j++) w[j] = w[j] / u1;
} else {
for (j = i + 1; j < tileSize; j++) w[j] = 0.0;
}
if (norm != 0.0)
tau = -sign * u1 / norm;
else
tau = 0.0;
/*Store the below diagonal vector */
for (j = i + 1; j < tileSize; j++) {
cornerTile[i * tileSize + j] = w[j];
}
cornerTile[i * tileSize + i] = sign * norm;
w[i] = 1.0;
/* Apply the householder transformation to the rest of the tile, for
* everything to the right of the diagonal. */
for (j = i + 1; j < tileSize; j++) {
/*Compute w'*A_j*/
z = cornerTile[j * tileSize + i];
for (n = i + 1; n < tileSize; n++) {
z = z + cornerTile[j * tileSize + n] * w[n];
}
/* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
for (n = i; n < tileSize; n++) {
cornerTile[j * tileSize + n] =
cornerTile[j * tileSize + n] - tau * w[n] * z;
}
}
/* Store tau. We're on k*tileSize+ith row. kth column.*/
// tauMatrix[(k * tileSize + i) * tauNum + k] = tau;
tauMatrix[i] = tau;
}
}
/**
* * @brief Applies the householder factorisation of the corner to the row tile.
*
* @param cornerTile A pointer to the tile for which the householder is stored.
* @param rowTiles The tile to which the householder is applied.
* @param tilesize The number of elements on a row/column of the tile.
* @param tauMatrix A pointer to the tau Matrix.
* @param jj The value of j for the QR decomposition.
* @param kk The value of k for the QR decomposition.
* @param tauNum The number of tau values stored for each row of the matrix
* (this is equal to the number of tiles on each column).
*
*
*/
void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize,
int jj, int kk, double* restrict tauMatrix, int tauNum) {
int i, j, n;
double z = 0.0;
double w[tileSize];
/* For each row in the corner Tile*/
for (i = 0; i < tileSize; i++) {
/*Get w for row i */
for (j = i; j < tileSize; j++) {
w[j] = cornerTile[i * tileSize + j];
}
w[i] = 1.0;
/* Apply to the row Tile */
for (j = 0; j < tileSize; j++) {
z = 0.0;
/* Below Diagonal!*/
/*Compute w'*A_j*/
for (n = i; n < tileSize; n++) {
z = z + w[n] * rowTile[j * tileSize + n];
}
for (n = i; n < tileSize; n++) {
rowTile[j * tileSize + n] =
rowTile[j * tileSize + n] -
//tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z;
tauMatrix[i] * w[n] * z;
}
}
}
}
/**
*
* @brief Applies the householder factorisation of the corner to the row tile.
*
* @param cornerTile The corner tile for this value of k.
* @param columnTile The tile for which householders are computed.
* @param tilesize The number of elements on a row/column of the tile.
* @param tauMatrix A pointer to the tau Matrix.
* @param ii The value of i for the QR decomposition.
* @param kk The value of k for the QR decomposition.
* @param tauNum The number of tau values stored for each row of the matrix
* (this is equal to the number of tiles on each column).
*
*
*/
void DTSQRF(double* restrict cornerTile, double* restrict columnTile,
int tilesize, int ii, int kk, double* restrict tauMatrix,
int tauNum) {
int i, j, n;
double norm = 0.0, sign, u1, tau, z;
double w[2*tilesize];
/* For each column compute the householder vector. */
for (i = 0; i < tilesize; i++) {
norm = 0.0; w[i] = cornerTile[i * tilesize + i];
norm = norm + w[i] * w[i];
for (j = i + 1; j < tilesize; j++) {
w[j] = 0.0;
}
for (j = 0; j < tilesize; j++) {
w[tilesize + j] = columnTile[i * tilesize + j];
norm = norm + w[tilesize + j] * w[tilesize + j];
}
norm = sqrt(norm);
if (w[i] >= 0.0)
sign = -1;
else
sign = 1;
u1 = w[i] - sign * norm;
if (u1 != 0.0) {
for (j = i + 1; j < 2 * tilesize; j++) {
w[j] = w[j] / u1;
}
} else {
for (j = i + 1; j < 2 * tilesize; j++) w[j] = 0.0;
}
if (norm != 0)
tau = -sign * u1 / norm;
else
tau = 0.0;
/* Apply to each row to the right.*/
for (j = i; j < tilesize; j++) {
/* Find w'*A_j, w is 0s except for first value with upper tile.*/
z = 1.0 * cornerTile[j * tilesize + i];
for (n = 0; n < tilesize; n++) {
z = z + w[tilesize + n] * columnTile[j * tilesize + n];
}
/* Apply to upper tile.*/
cornerTile[j * tilesize + i] =
cornerTile[j * tilesize + i] - tau * 1.0 * z;
for (n = i + 1; n < tilesize; n++) {
cornerTile[j * tilesize + n] =
cornerTile[j * tilesize + n] - tau * w[n] * z;
}
/* Apply to lower tile.*/
for (n = 0; n < tilesize; n++) {
columnTile[j * tilesize + n] =
columnTile[j * tilesize + n] - tau * w[tilesize + n] * z;
}
}
/* Store w*/
for (j = 0; j < tilesize; j++) {
columnTile[i * tilesize + j] = w[tilesize + j];
}
// tauMatrix[(kk * tilesize + i) * tauNum + ii] = tau;
tauMatrix[i] = tau;
}
}
/**
*
* @brief Applies the householder factorisation of the corner to the row tile.
*
* @param cornerTile A pointer to the tile to have the householder applied.
* @param columnTile The tile in which the householders are stored.
* @param rowTile The upper tile to have the householders applied.
* @param tilesize The number of elements on a row/column of the tile.
* @param tauMatrix A pointer to the tau Matrix.
* @param ii The value of i for the QR decomposition.
* @param jj The value of j for the QR decomposition. * @param kk The value of k for the QR decomposition.
* @param tauNum The number of tau values stored for each row of the matrix
* (this is equal to the number of tiles on each column).
*
*
*/
void DSSRFT(double* restrict cornerTile, double* restrict columnTile,
double* restrict rowTile, int tilesize, int ii, int jj, int kk,
double* restrict tauMatrix, int tauNum) {
int i, j, n;
double z;
double w[2*tilesize];
for (i = 0; i < tilesize; i++) {
for (j = 0; j < i; j++) w[j] = 0.0;
w[i] = 1.0;
for (j = i + 1; j < tilesize; j++) w[j] = 0.0;
for (j = 0; j < tilesize; j++)
w[j + tilesize] = columnTile[i * tilesize + j];
/* Apply householder vector (w) to the tiles.*/
for (j = 0; j < tilesize; j++) {
z = 0.0;
/* Compute w' * A_j */
for (n = 0; n < tilesize; n++) {
z += w[n] * rowTile[j * tilesize + n];
z += w[n + tilesize] * cornerTile[j * tilesize + n];
}
for (n = 0; n < tilesize; n++) {
rowTile[j * tilesize + n] =
rowTile[j * tilesize + n] -
//tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[n] * z;
tauMatrix[i] * w[n] * z;
cornerTile[j * tilesize + n] =
cornerTile[j * tilesize + n] -
// tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[tilesize + n] * z;
tauMatrix[i] * w[tilesize+ n] * z;
}
}
}
}
void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
int k, j, i;
double* A, *A_orig= NULL, *tau;
struct qsched s;
qsched_task_t* tid = NULL, tid_new = -1;
qsched_res_t *rid = NULL, *tau_id = NULL;
double **rids = NULL, **taus = NULL;
int data[3];
#ifdef TASK_TIMERS
long long int MPI_data[7];
#else
long long int MPI_data[6];
#endif
// ticks tic, toc_run, tot_setup, tot_run = 0;
#ifdef TASK_TIMERS
ticks tic, toc_run;
#endif
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
#ifdef TASK_TIMERS
long long int *task_start;
long long int *task_finish;
int *task_tids;
int *task_types;
#endif
// Initialize the MPI environment
int MpiThreadLevel;
MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &MpiThreadLevel);
if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
error("We didn't get thread multiple!");
MPI_Get_processor_name(processor_name, &name_len);
MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN );
enum task_types {
task_DGEQRF=1,
task_DLARFT,
task_DTSQRF,
task_DSSRFT
};
printf("Task_DGEQRF = %i\n", task_DGEQRF);
/* Runner function to pass to the scheduler. */
void runner(struct qsched *s, int type, void * data) {
#ifdef TASK_TIMERS
tic = getticks();
#endif
/* Decode the task data. */
long long int* idata = (long long int*)data;
// int i = idata[0], j = idata[1], k = idata[2];
long long int i, j , a , t;
long long int tid = -1;
/* Need to pull the resources.*/
/* Decode and execute the task. */
switch (type) {
case task_DGEQRF:
i = idata[0];
j = idata[1];
double *cornerTile = (double*)qsched_getresdata(s, i);
double *tau = (double*)qsched_getresdata(s, j);
DGEQRF(cornerTile, K, tau, k, m);
#ifdef TASK_TIMERS
tid = (idata[2] * m * m);
#endif
break;
case task_DLARFT:
t = idata[2];
i = idata[0];
j = idata[1];
double *lockedTile = (double*)qsched_getresdata(s, i);
cornerTile = (double*)qsched_getresdata(s, j);
tau = (double*)qsched_getresdata(s, t);
DLARFT(cornerTile, lockedTile, K, j, k, tau,
m);
#ifdef TASK_TIMERS
tid = (idata[3] * m * m) + (idata[4] * m);
#endif
break;
case task_DTSQRF:
i = idata[0];
j = idata[1];
t = idata[2];
lockedTile = (double*)qsched_getresdata(s, i);
cornerTile = (double*)qsched_getresdata(s, j);
tau = (double*)qsched_getresdata(s, t);
DTSQRF(cornerTile, lockedTile, K, i, k, tau,
m);
#ifdef TASK_TIMERS
tid = (idata[3] * m * m) + (idata[3] * m) + idata[4];
#endif
break;
case task_DSSRFT: a = idata[2];
i = idata[0];
j = idata[1];
t = idata[3];
double *lockedTile1 = (double*)qsched_getresdata(s, i);
double *usedTile = (double*)qsched_getresdata(s, j);
double *lockedTile2 = (double*)qsched_getresdata(s, a);
tau = (double*)qsched_getresdata(s, t);
DSSRFT(lockedTile1, usedTile,
lockedTile2, K, i, j, k, tau, m);
#ifdef TASK_TIMERS
tid = (idata[4] * m * m) + (idata[6] * m) + (idata[5]);
#endif
break;
// default:
// error("Unknown task type.");
}
#ifdef TASK_TIMERS
if(type > 0){
toc_run = getticks();
task_start[tid] = tic;
task_finish[tid] = toc_run;
task_tids[tid] = omp_get_thread_num();
task_types[tid] = type;
}
#endif
}
qsched_init(&s, 2, qsched_flag_none, MPI_COMM_WORLD);
#ifdef TASK_TIMERS
task_start = (long long int*)calloc(sizeof(long long int), m*n*K*K);
printf("Created task_start of size %i", m*n*K*K);
task_finish = (long long int*)calloc(sizeof(long long int), m*n*K*K);
task_tids = (int*)calloc(sizeof(int), m*n*K*K);
task_types = (int*)calloc(sizeof(int), m*n*K*K);
#endif
if(s.rank == 0){
/* Allocate and fill the original matrix. */
if ((A = (double*)malloc(sizeof(double)* m* n* K* K)) == NULL ||
(tau = (double*)malloc(sizeof(double)* m* n* K)) == NULL ||
(A_orig = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL)
error("Failed to allocate matrices.");
for (k = 0; k < m * n * K * K; k++) {
if (matrix == NULL)
A_orig[k] = 2 * ((double)rand()) / RAND_MAX - 1.0;
else
A_orig[k] = matrix[k];
}
memcpy(A, A_orig, sizeof(double) * m * n * K * K);
bzero(tau, sizeof(double) * m * n * K);
/* Initialize the scheduler. */
/* Allocate and init the task ID and resource ID matrix. */
// tic = getticks();
if ((tid = (qsched_task_t*)malloc(sizeof(qsched_task_t)* m* n)) == NULL ||
(rid = (qsched_res_t*)malloc(sizeof(qsched_res_t) * m * n)) == NULL)
error("Failed to allocate tid/rid matrix.");
if( (rids = (double**)malloc(sizeof(double*)*m*n)) == NULL)
error("Failed to allocate rids.");
if( (tau_id = (qsched_res_t*)malloc(sizeof(qsched_res_t) * m*n) ) == NULL)
error("Failed to allocate tau_id matrix.");
if( (taus = (double**)malloc(sizeof(double*)*m*n)) == NULL)
error("Failed to allocate taus.");
for (k = 0; k < m * n; k++) {
//tid[k] = qsched_task_none;
rid[k] = qsched_addres(&s, qsched_owner_none, sizeof(double) * K * K, (void**) &rids[k]);
tau_id[k] = qsched_addres(&s, qsched_owner_none, sizeof(double) * K, (void**) &taus[k]);
memcpy(taus[k], tau, sizeof(double) * K); }
}
message("Synchronizing resources.");
qsched_sync_resources(&s);
if(s.rank == 0) {
for (k = 0; k < m * n; k++) {
tid[k] = qsched_task_none;
}
/* Build the tasks. */
for (k = 0; k < m && k < n; k++) {
/* Add kth corner task. */
data[0] = k;
data[1] = k;
data[2] = k;
MPI_data[0] = rid[k*m+k];
MPI_data[1] = tau_id[k*m+k];
#ifdef TASK_TIMERS
MPI_data[2] = k;
tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, MPI_data,
sizeof(long long int) * 3, 300);
#else
tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, MPI_data,
sizeof(long long int) * 2, 200);
#endif
qsched_addlock(&s, tid_new, rid[k * m + k]);
qsched_addlock(&s, tid_new, tau_id[k*m+k]);
if(k == 0)
{
memcpy(rids[k*m+k], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
}
if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new);
tid[k * m + k] = tid_new;
/* Add column tasks on kth row. */
for (j = k + 1; j < n; j++) {
data[0] = k;
data[1] = j;
data[2] = k;
MPI_data[0] = rid[j*m+k];
MPI_data[1] = rid[k*m+k];
MPI_data[2] = tau_id[k*m+k];
#ifdef TASK_TIMERS
MPI_data[3] = k;
MPI_data[4] = j;
tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, MPI_data,
sizeof(long long int) * 5, 300);
#else
tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, MPI_data,
sizeof(long long int) * 3, 300);
#endif
if(k == 0)
{
memcpy(rids[j*m+k], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
}
qsched_addlock(&s, tid_new, rid[j * m + k]);
qsched_adduse(&s, tid_new, rid[k * m + k]);
qsched_adduse(&s, tid_new, tau_id[k*m+k]);
qsched_addunlock(&s, tid[k * m + k], tid_new);
if (tid[j * m + k] != -1) qsched_addunlock(&s, tid[j * m + k], tid_new);
tid[j * m + k] = tid_new;
}
/* For each following row... */
for (i = k + 1; i < m; i++) {
/* Add the row taks for the kth column. */
data[0] = i;
data[1] = k; data[2] = k;
MPI_data[0] = rid[k*m+i];
MPI_data[1] = rid[k*m+k];
MPI_data[2] = tau_id[k*m+i];
#ifdef TASK_TIMERS
MPI_data[3] = k;
MPI_data[4] = i;
tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, MPI_data,
sizeof(long long int) * 5, 300);
#else
tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, MPI_data,
sizeof(long long int) * 3, 300);
#endif
if(k == 0)
{
memcpy(rids[k*m+i], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
}
qsched_addlock(&s, tid_new, rid[k * m + i]);
qsched_adduse(&s, tid_new, rid[k * m + k]);
qsched_addlock(&s, tid_new, tau_id[k*m+i]);
qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
tid[k * m + i] = tid_new;
/* Add the inner tasks. */
for (j = k + 1; j < n; j++) {
data[0] = i;
data[1] = j;
data[2] = k;
MPI_data[0] = rid[j*m+i];
MPI_data[1] = rid[k*m+i];
MPI_data[2] = rid[j*m+k];
MPI_data[3] = tau_id[k*m+i];
#ifdef TASK_TIMERS
MPI_data[4] = k;
MPI_data[5] = i;
MPI_data[6] = j;
tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, MPI_data,
sizeof(long long int) * 7, 300);
#else
tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, MPI_data,
sizeof(long long int) * 4, 500);
#endif
if(k == 0)
{
memcpy(rids[j*m+i], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
}
qsched_addlock(&s, tid_new, rid[j * m + i]);
qsched_adduse(&s, tid_new, rid[k * m + i]);
qsched_addlock(&s, tid_new, rid[j * m + k]);
qsched_adduse(&s, tid_new, tau_id[k*m+i]);
// qsched_addunlock(&s, tid[k * m + i], tid_new);
qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
tid[j * m + i] = tid_new;
}
}
} /* build the tasks. */
}
qsched_run_MPI(&s, nr_threads, runner);
// Print off a hello world message
printf("Hello world from processor %s, rank = %i, count_ranks = %i\n",
processor_name, s.rank, s.count_ranks);
//TODO Clean up the resource data.
qsched_free(&s);
#ifdef TASK_TIMERS
FILE *file = NULL; if(s.rank == 0)
file = fopen("Task_timers0.out", "w");
else if (s.rank == 1)
file = fopen("Task_timers1.out", "w");
else if (s.rank == 2)
file = fopen("Task_timers2.out", "w");
else if (s.rank == 3)
file = fopen("Task_timers3.out", "w");
for(i = 0; i < m*n*K*K; i++)
{
if(task_types[i] > 0)
fprintf(file, "%i %i %lli %lli\n", task_types[i], task_tids[i],task_start[i], task_finish[i] );
}
fclose(file);
free(task_types);
free(task_tids);
free(task_start);
free(task_finish);
#endif
// Finalize the MPI environment.
MPI_Finalize();
}
/**
* @brief Main function.
*/
int main(int argc, char* argv[]) {
int c, nr_threads=1;
int M = 4, N = 4, runs = 1, K = 32;
/* Get the number of threads. */
//#pragma omp parallel shared(nr_threads)
//{
// if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads();
//}
/* Parse the options */
while ((c = getopt(argc, argv, "m:n:k:r:t:")) != -1) switch (c) {
case 'm':
if (sscanf(optarg, "%d", &M) != 1) error("Error parsing dimension M.");
break;
case 'n':
if (sscanf(optarg, "%d", &N) != 1) error("Error parsing dimension M.");
break;
case 'k':
if (sscanf(optarg, "%d", &K) != 1) error("Error parsing tile size.");
break;
case 'r':
if (sscanf(optarg, "%d", &runs) != 1)
error("Error parsing number of runs.");
break;
case 't':
if (sscanf(optarg, "%d", &nr_threads) != 1)
error("Error parsing number of threads.");
omp_set_num_threads(nr_threads);
message("omp_get_max_threads() = %i\n", omp_get_max_threads());
message("omp_get_num_procs() = %i\n", omp_get_num_procs());
break;
case '?':
fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
argv[0]);
fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n"
"matrix using nr_threads threads.\n");
exit(EXIT_FAILURE);
}
/* Dump arguments. */
message("Computing the tiled QR decomposition of a %ix%i matrix using %i "
"threads (%i runs).",
32 * M, 32 * N, nr_threads, runs);
test_qr(M, N, K, nr_threads, runs, NULL);
}