diff --git a/examples/Makefile.am b/examples/Makefile.am
index 75e1aa2245a836b2a10183f0ffcae94665fbada4..46d5cd60eb7d2162522416a6035dcdd4dc62e8cb 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -29,9 +29,9 @@ MPI_THREAD_LIBS = @MPI_THREAD_LIBS@
 MPI_LIBS = $(METIS_LIBS) $(MPI_THREAD_LIBS)
 
 # Set-up the library
-bin_PROGRAMS = test test_bh test_bh_sorted test_fmm_sorted test_bh_mpi
+bin_PROGRAMS = test test_bh test_bh_sorted test_fmm_sorted test_bh_mpi test_qr_mpi
 if HAVECBLAS
-bin_PROGRAMS += test_qr
+bin_PROGRAMS += test_qr
 endif
 
 # Sources for test
@@ -63,3 +63,8 @@ test_bh_mpi_SOURCES = test_bh_mpi.c
 test_bh_mpi_CFLAGS = $(AM_CFLAGS) -DWITH_MPI
 test_bh_mpi_LDADD = ../src/.libs/libquickschedMPI.a $(METIS_LIBS)
 test_bh_mpi_LDFLAGS = $(MPI_THREAD_LIBS)
+
+test_qr_mpi_SOURCES = test_qr_mpi.c
+test_qr_mpi_CFLAGS = $(AM_CFLAGS) -DWITH_MPI
+test_qr_mpi_LDADD = ../src/.libs/libquickschedMPI.a $(METIS_LIBS)
+test_qr_mpi_LDFLAGS = $(MPI_THREAD_LIBS)
diff --git a/examples/test_bh_mpi.c b/examples/test_bh_mpi.c
index aff03025a545aa1b96759a5d04bebc2ab278a622..79ee40a04a0271b2640994986acd8ebcaebee83e 100644
--- a/examples/test_bh_mpi.c
+++ b/examples/test_bh_mpi.c
@@ -495,7 +495,7 @@ static inline void iact_pair_pc(struct qsched *s, struct cell *ci, struct cell *
 #ifdef SANITY_CHECKS
     if(cp->parts == NULL)
     {
-        error("cp->parts is NULL after we ttempted to get the data from the scheduler.");
+        error("cp->parts is NULL after we attempted to get the data from the scheduler.");
     }
 #endif
     /* Get local copies of the particle data in cp. */
@@ -1014,7 +1014,7 @@ void interact_exact(int N, struct part *parts) {
 
   int i, j, k;
   float ir, w, r2, dx[3];
-
+  message("Starting exact interaction");
 
   /* Loop over all particles. */
   for (i = 0; i < N; ++i) {
@@ -1050,6 +1050,7 @@ void interact_exact(int N, struct part *parts) {
             parts[i].id, parts[i].a_exact[0], parts[i].a_exact[1],
             parts[i].a_exact[2]);*/
 #endif
+  message("Completed exact interaction");
 }
 
 void output_parts(struct qsched *s)
@@ -1286,6 +1287,9 @@ for(i = 0; i < s.count_ranks; i++)
 {
 }
 //Need to clean up everything.
+    free(parts);
+    //TODO Clean up the cell-resource data.
+    qsched_free(&s);
     // Finalize the MPI environment.
     MPI_Finalize();
 
diff --git a/examples/test_qr_mpi.c b/examples/test_qr_mpi.c
new file mode 100644
index 0000000000000000000000000000000000000000..673f1ee5ce0512d0c160264011999d25c750b0b2
--- /dev/null
+++ b/examples/test_qr_mpi.c
@@ -0,0 +1,742 @@
+/*******************************************************************************
+ * This file is part of QuickSched.
+ * Copyright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+* *****************************************************************************/
+/* Config parameters. */
+#include "../config.h"
+
+/* Standard includes. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <math.h>
+#include <float.h>
+#include <limits.h>
+#include <omp.h>
+#include <fenv.h>
+#include <mpi.h>
+
+
+#ifdef WITH_CBLAS
+#include <cblas.h>
+#endif
+
+/* Local includes. */
+#include "quicksched.h"
+#include "res.h"
+
+
+#ifdef WITH_CBLAS
+/**
+ * Takes a column major matrix, NOT tile major. size is length of a side of the
+ * matrix. Only works for square matrices.
+ * This function is simply for validation and is implemented naively as we know
+ * of no implementation to retrieve Q from the tiled QR.
+ */
+double* computeQ(double* HR, int size, int tilesize, double* tau, int tauNum) {
+  double* Q = malloc(sizeof(double) * size * size);
+  double* Qtemp = malloc(sizeof(double) * size * size);
+  double* w = malloc(sizeof(double) * size);
+  double* ww = malloc(sizeof(double) * size * size);
+  double* temp = malloc(sizeof(double) * size * size);
+  int i, k, l, j, n;
+  bzero(Q, sizeof(double) * size * size);
+  bzero(Qtemp, sizeof(double) * size * size);
+  bzero(ww, sizeof(double) * size * size);
+  for (i = 0; i < size; i++) {
+    Q[i * size + i] = 1.0;
+  }
+  int numcoltile = size / tilesize;
+  int numrowtile = size / tilesize;
+  for (k = 0; k < numrowtile; k++) {
+    for (l = 0; l < tilesize; l++) {
+      bzero(Qtemp, sizeof(double) * size * size);
+      for (i = 0; i < size; i++) {
+        Qtemp[i * size + i] = 1.0;
+      }
+
+      for (i = k; i < numcoltile; i++) {
+        bzero(w, sizeof(double) * size);
+
+        for (j = 0; j < tilesize; j++) {
+          w[i * tilesize + j] =
+              HR[(k * tilesize + l) * size + i * tilesize + j];
+        }
+        w[k * tilesize + l] = 1.0;
+        if (k * tilesize + l > i * tilesize) {
+          for (j = 0; j < k * tilesize + l; j++) w[j] = 0.0;
+        }
+
+        /* Compute (I - tau*w*w')' */
+        for (j = 0; j < size; j++) {
+          for (n = 0; n < size; n++) {
+            if (j != n)
+              ww[n * size + j] =
+                  -tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
+            else
+              ww[n * size + j] =
+                  1.0 - tau[(k * tilesize + l) * tauNum + i] * w[j] * w[n];
+          }
+        }
+
+        /* Qtemp = Qtemp * (I-tau*w*w')' */
+        cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
+                    1.0, Qtemp, size, ww, size, 0.0, temp, size);
+        double* b = Qtemp;
+        Qtemp = temp;
+        temp = b;
+      }
+      cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, size, size, size,
+                  1.0, Q, size, Qtemp, size, 0.0, temp, size);
+      double* b = Q;
+      Q = temp;
+      temp = b;
+    }
+  }
+
+  free(Qtemp);
+  free(w);
+  free(ww);
+  free(temp);
+  return Q;
+}
+
+#endif
+
+double* getR(double* HR, int size) {
+  double* R = malloc(sizeof(double) * size * size);
+  int i, j;
+  bzero(R, sizeof(double) * size * size);
+  for (i = 0; i < size; i++) {
+    for (j = 0; j <= i; j++) {
+      R[i * size + j] = HR[i * size + j];
+    }
+  }
+  return R;
+}
+
+void printMatrix(double* Matrix, int m, int n, int tilesize) {
+  int i, j;
+
+  for (i = 0; i < m * tilesize; i++) {
+    for (j = 0; j < n * tilesize; j++) {
+      printf(" %.3f ", Matrix[j * m * tilesize + i]);
+    }
+    printf("\n");
+  }
+}
+
+double* columnToTile(double* columnMatrix, int size, int m, int n,
+                     int tilesize) {
+  double* TileMatrix;
+  TileMatrix = malloc(sizeof(double) * size);
+  if (TileMatrix == NULL) error("failed to allocate TileMatrix");
+  int i, j, k, l;
+
+  for (i = 0; i < n; i++) {
+    for (j = 0; j < m; j++) {
+      double* tileStart =
+          &columnMatrix[i * m * tilesize * tilesize + j * tilesize];
+      double* tilePos =
+          &TileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+      for (k = 0; k < tilesize; k++) {
+        tileStart = &columnMatrix[i * m * tilesize * tilesize +
+                                  k * m * tilesize + j * tilesize];
+        for (l = 0; l < tilesize; l++) {
+          tilePos[k * tilesize + l] = tileStart[l];
+        }
+      }
+    }
+  }
+
+  return TileMatrix;
+}
+
+double* tileToColumn(double* tileMatrix, int size, int m, int n, int tilesize) {
+  double* ColumnMatrix;
+  ColumnMatrix = (double*)malloc(sizeof(double) * size);
+  if (ColumnMatrix == NULL) error("failed to allocate ColumnMatrix");
+  int i, j, k, l;
+  for (i = 0; i < n; i++) {
+    for (j = 0; j < m; j++) {
+      /* Tile on ith column is at i*m*32*32.*/
+      /* Tile on jth is at j*32*32 */
+      double* tile =
+          &tileMatrix[i * m * tilesize * tilesize + j * tilesize * tilesize];
+      /* Column starts at same position as tile. */
+      /* Row j*32.*/
+      double* tilePos =
+          &ColumnMatrix[i * m * tilesize * tilesize + j * tilesize];
+      for (k = 0; k < tilesize; k++) {
+        for (l = 0; l < tilesize; l++) {
+          tilePos[l] = tile[l];
+        }
+        /* Next 32 elements are the position of the tile in the next column.*/
+        tile = &tile[tilesize];
+        /* Move to the j*32th position in the next column. */
+        tilePos = &tilePos[tilesize * m];
+      }
+    }
+  }
+  return ColumnMatrix;
+}
+
+/* Routines for the tiled QR decomposition.*/
+
+/**
+ *
+ * @brief Computes the QR decomposition of a tile.
+ *
+ * @param cornerTile A pointer to the tile for which the decomposition is
+ * computed.
+ * @param tileSize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param k The value of k for the QR decomposition
+ * @param tauNum The number of tau values stored for each row of the matrix
+ * (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+void DGEQRF(double* restrict cornerTile, int tileSize,
+            double* restrict tauMatrix, int k, int tauNum) {
+  int i, j, n;
+  double norm = 0.0, sign, u1, tau, z;
+  double w[tileSize];
+
+  /*Find the householder vector for each row. */
+  for (i = 0; i < tileSize; i++) {
+    norm = 0.0;
+    /*Fill w with the vector.*/
+    for (j = i; j < tileSize; j++) {
+      /* ith row is i*tileSize, only want elements on diagonal or below.*/
+      w[j] = cornerTile[i * tileSize + j];
+      /*Find the norm as well */
+      norm = norm + w[j] * w[j];
+    }
+    if (w[i] >= 0.0)
+      sign = -1;
+    else
+      sign = 1;
+
+    norm = sqrt(norm);
+
+    u1 = w[i] - sign * norm;
+
+    if (u1 != 0.0) {
+      for (j = i + 1; j < tileSize; j++) w[j] = w[j] / u1;
+    } else {
+      for (j = i + 1; j < tileSize; j++) w[j] = 0.0;
+    }
+
+    if (norm != 0.0)
+      tau = -sign * u1 / norm;
+    else
+      tau = 0.0;
+
+    /*Store the below diagonal vector */
+
+    for (j = i + 1; j < tileSize; j++) {
+      cornerTile[i * tileSize + j] = w[j];
+    }
+    cornerTile[i * tileSize + i] = sign * norm;
+    w[i] = 1.0;
+    /* Apply the householder transformation to the rest of the tile, for
+     * everything to the right of the diagonal. */
+    for (j = i + 1; j < tileSize; j++) {
+      /*Compute w'*A_j*/
+      z = cornerTile[j * tileSize + i];
+      for (n = i + 1; n < tileSize; n++) {
+        z = z + cornerTile[j * tileSize + n] * w[n];
+      }
+      /* Tile(m,n) = Tile(m,n) - tau*w[n]* w'A_j */
+      for (n = i; n < tileSize; n++) {
+        cornerTile[j * tileSize + n] =
+            cornerTile[j * tileSize + n] - tau * w[n] * z;
+      }
+    }
+    /* Store tau. We're on k*tileSize+ith row. kth column.*/
+// tauMatrix[(k * tileSize + i) * tauNum + k] = tau;
+    tauMatrix[i] = tau;
+  }
+}
+
+/**
+ *
+ * @brief Applies the householder factorisation of the corner to the row tile.
+ *
+ * @param cornerTile A pointer to the tile for which the householder is stored.
+ * @param rowTile The tile to which the householder is applied.
+ * @param tileSize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param jj The value of j for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix
+ * (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+void DLARFT(double* restrict cornerTile, double* restrict rowTile, int tileSize,
+            int jj, int kk, double* restrict tauMatrix, int tauNum) {
+  int i, j, n;
+  double z = 0.0;
+  double w[tileSize];
+
+  /* For each row in the corner Tile*/
+  for (i = 0; i < tileSize; i++) {
+    /*Get w for row i */
+    for (j = i; j < tileSize; j++) {
+      w[j] = cornerTile[i * tileSize + j];
+    }
+    w[i] = 1.0;
+
+    /* Apply to the row Tile */
+    for (j = 0; j < tileSize; j++) {
+      z = 0.0;
+      /* Below Diagonal!*/
+      /*Compute w'*A_j*/
+      for (n = i; n < tileSize; n++) {
+        z = z + w[n] * rowTile[j * tileSize + n];
+      }
+      for (n = i; n < tileSize; n++) {
+        rowTile[j * tileSize + n] =
+            rowTile[j * tileSize + n] -
+            //tauMatrix[(kk * tileSize + i) * tauNum + kk] * w[n] * z;
+            tauMatrix[i] * w[n] * z;
+      }
+    }
+  }
+}
+
+/**
+ *
+ * @brief Computes the QR factorisation of the corner tile coupled with the
+ * column tile below it; the resulting householder vectors are stored in the
+ * column tile.
+ *
+ * @param cornerTile The corner tile for this value of k.
+ * @param columnTile The tile for which householders are computed.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param ii The value of i for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix
+ * (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+void DTSQRF(double* restrict cornerTile, double* restrict columnTile,
+            int tilesize, int ii, int kk, double* restrict tauMatrix,
+            int tauNum) {
+  int i, j, n;
+  double norm = 0.0, sign, u1, tau, z;
+  double w[2*tilesize];
+
+  /* For each column compute the householder vector. */
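+  /* Note: each reflector here spans two stacked tiles, so w holds 2*tilesize
+   * entries.  In the first half only entry i is used (the corner tile's
+   * diagonal element, treated as 1 in the update); the second half holds the
+   * full column of the tile below.  Both tiles are then updated column by
+   * column with the usual A_j <- A_j - tau * w * (w'*A_j) step, which is
+   * what the loops below implement. */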
+  for (i = 0; i < tilesize; i++) {
+    norm = 0.0;
+    w[i] = cornerTile[i * tilesize + i];
+    norm = norm + w[i] * w[i];
+    for (j = i + 1; j < tilesize; j++) {
+      w[j] = 0.0;
+    }
+    for (j = 0; j < tilesize; j++) {
+      w[tilesize + j] = columnTile[i * tilesize + j];
+      norm = norm + w[tilesize + j] * w[tilesize + j];
+    }
+
+    norm = sqrt(norm);
+    if (w[i] >= 0.0)
+      sign = -1;
+    else
+      sign = 1;
+
+    u1 = w[i] - sign * norm;
+    if (u1 != 0.0) {
+      for (j = i + 1; j < 2 * tilesize; j++) {
+        w[j] = w[j] / u1;
+      }
+    } else {
+      for (j = i + 1; j < 2 * tilesize; j++) w[j] = 0.0;
+    }
+
+    if (norm != 0)
+      tau = -sign * u1 / norm;
+    else
+      tau = 0.0;
+
+    /* Apply to each row to the right.*/
+    for (j = i; j < tilesize; j++) {
+      /* Find w'*A_j, w is 0s except for first value with upper tile.*/
+      z = 1.0 * cornerTile[j * tilesize + i];
+      for (n = 0; n < tilesize; n++) {
+        z = z + w[tilesize + n] * columnTile[j * tilesize + n];
+      }
+      /* Apply to upper tile.*/
+      cornerTile[j * tilesize + i] =
+          cornerTile[j * tilesize + i] - tau * 1.0 * z;
+      for (n = i + 1; n < tilesize; n++) {
+        cornerTile[j * tilesize + n] =
+            cornerTile[j * tilesize + n] - tau * w[n] * z;
+      }
+      /* Apply to lower tile.*/
+      for (n = 0; n < tilesize; n++) {
+        columnTile[j * tilesize + n] =
+            columnTile[j * tilesize + n] - tau * w[tilesize + n] * z;
+      }
+    }
+    /* Store w*/
+    for (j = 0; j < tilesize; j++) {
+      columnTile[i * tilesize + j] = w[tilesize + j];
+    }
+// tauMatrix[(kk * tilesize + i) * tauNum + ii] = tau;
+    tauMatrix[i] = tau;
+  }
+}
+
+/**
+ *
+ * @brief Applies the householder reflectors stored in the column tile to the
+ * corner and row tiles.
+ *
+ * @param cornerTile A pointer to the tile to have the householder applied.
+ * @param columnTile The tile in which the householders are stored.
+ * @param rowTile The upper tile to have the householders applied.
+ * @param tilesize The number of elements on a row/column of the tile.
+ * @param tauMatrix A pointer to the tau Matrix.
+ * @param ii The value of i for the QR decomposition.
+ * @param jj The value of j for the QR decomposition.
+ * @param kk The value of k for the QR decomposition.
+ * @param tauNum The number of tau values stored for each row of the matrix
+ * (this is equal to the number of tiles on each column).
+ *
+ *
+ */
+void DSSRFT(double* restrict cornerTile, double* restrict columnTile,
+            double* restrict rowTile, int tilesize, int ii, int jj, int kk,
+            double* restrict tauMatrix, int tauNum) {
+  int i, j, n;
+  double z;
+  double w[2*tilesize];
+
+  for (i = 0; i < tilesize; i++) {
+    for (j = 0; j < i; j++) w[j] = 0.0;
+    w[i] = 1.0;
+    for (j = i + 1; j < tilesize; j++) w[j] = 0.0;
+    for (j = 0; j < tilesize; j++)
+      w[j + tilesize] = columnTile[i * tilesize + j];
+
+    /* Apply householder vector (w) to the tiles.*/
+    for (j = 0; j < tilesize; j++) {
+      z = 0.0;
+      /* Compute w' * A_j */
+      for (n = 0; n < tilesize; n++) {
+        z += w[n] * rowTile[j * tilesize + n];
+        z += w[n + tilesize] * cornerTile[j * tilesize + n];
+      }
+      for (n = 0; n < tilesize; n++) {
+        rowTile[j * tilesize + n] =
+            rowTile[j * tilesize + n] -
+            //tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[n] * z;
+            tauMatrix[i] * w[n] * z;
+        cornerTile[j * tilesize + n] =
+            cornerTile[j * tilesize + n] -
+            // tauMatrix[(kk * tilesize + i) * tauNum + ii] * w[tilesize + n] * z;
+            tauMatrix[i] * w[tilesize + n] * z;
+      }
+    }
+  }
+}
+
+
+
+void test_qr(int m, int n, int K, int nr_threads, int runs, double* matrix) {
+
+  int k, j, i;
+  double *A, *A_orig = NULL, *tau;
+  struct qsched s;
+  qsched_task_t *tid = NULL, tid_new;
+  qsched_res_t *rid = NULL, *tau_id = NULL;
+  double **rids = NULL, **taus = NULL;
+  int data[3];
+  long long int MPI_data[6];
+//  ticks tic, toc_run, tot_setup, tot_run = 0;
+  char processor_name[MPI_MAX_PROCESSOR_NAME];
+  int name_len;
+
+  // Initialize the MPI environment
+  int MpiThreadLevel;
+  MPI_Init_thread(NULL, NULL, MPI_THREAD_MULTIPLE, &MpiThreadLevel);
+
+  if(MpiThreadLevel != MPI_THREAD_MULTIPLE)
+    error("We didn't get thread multiple!");
+
+  MPI_Get_processor_name(processor_name, &name_len);
+  MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
+
+  enum task_types {
+    task_DGEQRF,
+    task_DLARFT,
+    task_DTSQRF,
+    task_DSSRFT
+  };
+
+  /* Runner function to pass to the scheduler. */
+  void runner(struct qsched *s, int type, void * data) {
+
+    /* Decode the task data. */
+    long long int* idata = (long long int*)data;
+//    int i = idata[0], j = idata[1], k = idata[2];
+    long long int i, j, a, t;
+    /* Need to pull the resources.*/
+
+    /* Decode and execute the task. */
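+    /* The payload stored in MPI_data[] when the tasks are created below is a
+     * list of qsched resource handles; qsched_getresdata() turns each handle
+     * back into a pointer to the locally available copy of that resource
+     * before the kernel for the given task type is called. */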
+    switch (type) {
+      case task_DGEQRF:
+        i = idata[0];
+        j = idata[1];
+        double *cornerTile = (double*)qsched_getresdata(s, i);
+        double *tau = (double*)qsched_getresdata(s, j);
+        DGEQRF(cornerTile, K, tau, k, m);
+        break;
+      case task_DLARFT:
+        t = idata[2];
+        i = idata[0];
+        j = idata[1];
+        double *lockedTile = (double*)qsched_getresdata(s, i);
+        cornerTile = (double*)qsched_getresdata(s, j);
+        tau = (double*)qsched_getresdata(s, t);
+        DLARFT(cornerTile, lockedTile, K, j, k, tau,
+               m);
+        break;
+      case task_DTSQRF:
+        i = idata[0];
+        j = idata[1];
+        t = idata[2];
+        lockedTile = (double*)qsched_getresdata(s, i);
+        cornerTile = (double*)qsched_getresdata(s, j);
+        tau = (double*)qsched_getresdata(s, t);
+        DTSQRF(cornerTile, lockedTile, K, i, k, tau,
+               m);
+        break;
+      case task_DSSRFT:
+        a = idata[2];
+        i = idata[0];
+        j = idata[1];
+        t = idata[3];
+        double *lockedTile1 = (double*)qsched_getresdata(s, i);
+        double *usedTile = (double*)qsched_getresdata(s, j);
+        double *lockedTile2 = (double*)qsched_getresdata(s, a);
+        tau = (double*)qsched_getresdata(s, t);
+        DSSRFT(lockedTile1, usedTile,
+               lockedTile2, K, i, j, k, tau, m);
+        break;
+//      default:
+//        error("Unknown task type.");
+    }
+  }
+
+  qsched_init(&s, nr_threads, qsched_flag_noreown, MPI_COMM_WORLD);
+
+if(s.rank == 0) {
+  /* Allocate and fill the original matrix. */
+  if ((A = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL ||
+      (tau = (double*)malloc(sizeof(double) * m * n * K)) == NULL ||
+      (A_orig = (double*)malloc(sizeof(double) * m * n * K * K)) == NULL)
+    error("Failed to allocate matrices.");
+  for (k = 0; k < m * n * K * K; k++) {
+    if (matrix == NULL)
+      A_orig[k] = 2 * ((double)rand()) / RAND_MAX - 1.0;
+    else
+      A_orig[k] = matrix[k];
+  }
+  memcpy(A, A_orig, sizeof(double) * m * n * K * K);
+  bzero(tau, sizeof(double) * m * n * K);
+
+  /* Initialize the scheduler. */
+  /* Allocate and init the task ID and resource ID matrix. */
+//  tic = getticks();
+  if ((tid = (qsched_task_t*)malloc(sizeof(qsched_task_t) * m * n)) == NULL ||
+      (rid = (qsched_res_t*)malloc(sizeof(qsched_res_t) * m * n)) == NULL)
+    error("Failed to allocate tid/rid matrix.");
+  if ((rids = (double**)malloc(sizeof(double*) * m * n)) == NULL)
+    error("Failed to allocate rids.");
+  if ((tau_id = (qsched_res_t*)malloc(sizeof(qsched_res_t) * m * n)) == NULL)
+    error("Failed to allocate tau_id matrix.");
+  if ((taus = (double**)malloc(sizeof(double*) * m * n)) == NULL)
+    error("Failed to allocate taus.");
+  for (k = 0; k < m * n; k++) {
+    //tid[k] = qsched_task_none;
+    rid[k] = qsched_addres(&s, qsched_owner_none, sizeof(double) * K * K,
+                           (void**)&rids[k]);
+    tau_id[k] = qsched_addres(&s, qsched_owner_none, sizeof(double) * K,
+                              (void**)&taus[k]);
+    memcpy(taus[k], tau, sizeof(double) * K);
+  }
+}
+  message("Synchronizing resources.");
+  qsched_sync_resources(&s);
+
+if(s.rank == 0) {
+  for (k = 0; k < m * n; k++) {
+    tid[k] = qsched_task_none;
+  }
+  /* Build the tasks. */
+  for (k = 0; k < m && k < n; k++) {
+
+    /* Add kth corner task. */
+    data[0] = k;
+    data[1] = k;
+    data[2] = k;
+    MPI_data[0] = rid[k*m+k];
+    MPI_data[1] = tau_id[k*m+k];
+    tid_new = qsched_addtask(&s, task_DGEQRF, task_flag_none, MPI_data,
+                             sizeof(long long int) * 2, 200);
+    qsched_addlock(&s, tid_new, rid[k * m + k]);
+    qsched_addlock(&s, tid_new, tau_id[k*m+k]);
+    if(k == 0)
+    {
+      memcpy(rids[k*m+k], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
+    }
+    if (tid[k * m + k] != -1) qsched_addunlock(&s, tid[k * m + k], tid_new);
+    tid[k * m + k] = tid_new;
+
+    /* Add column tasks on kth row. */
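+    /* Each DLARFT task locks the tile it modifies, only uses (reads) the
+     * corner tile and its tau values, and is unlocked by the corner DGEQRF
+     * task above, as set up through the addlock/adduse/addunlock calls
+     * below. */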
+    for (j = k + 1; j < n; j++) {
+      data[0] = k;
+      data[1] = j;
+      data[2] = k;
+      MPI_data[0] = rid[j*m+k];
+      MPI_data[1] = rid[k*m+k];
+      MPI_data[2] = tau_id[k*m+k];
+      tid_new = qsched_addtask(&s, task_DLARFT, task_flag_none, MPI_data,
+                               sizeof(long long int) * 3, 300);
+      if(k == 0)
+      {
+        memcpy(rids[j*m+k], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
+      }
+      qsched_addlock(&s, tid_new, rid[j * m + k]);
+      qsched_adduse(&s, tid_new, rid[k * m + k]);
+      qsched_adduse(&s, tid_new, tau_id[k*m+k]);
+      qsched_addunlock(&s, tid[k * m + k], tid_new);
+      if (tid[j * m + k] != -1) qsched_addunlock(&s, tid[j * m + k], tid_new);
+      tid[j * m + k] = tid_new;
+    }
+
+    /* For each following row... */
+    for (i = k + 1; i < m; i++) {
+
+      /* Add the row tasks for the kth column. */
+      data[0] = i;
+      data[1] = k;
+      data[2] = k;
+      MPI_data[0] = rid[k*m+i];
+      MPI_data[1] = rid[k*m+k];
+      MPI_data[2] = tau_id[k*m+i];
+      tid_new = qsched_addtask(&s, task_DTSQRF, task_flag_none, MPI_data,
+                               sizeof(long long int) * 3, 300);
+      if(k == 0)
+      {
+        memcpy(rids[k*m+i], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
+      }
+      qsched_addlock(&s, tid_new, rid[k * m + i]);
+      qsched_adduse(&s, tid_new, rid[k * m + k]);
+      qsched_addlock(&s, tid_new, tau_id[k*m+i]);
+      qsched_addunlock(&s, tid[k * m + (i - 1)], tid_new);
+      if (tid[k * m + i] != -1) qsched_addunlock(&s, tid[k * m + i], tid_new);
+      tid[k * m + i] = tid_new;
+
+      /* Add the inner tasks. */
+      for (j = k + 1; j < n; j++) {
+        data[0] = i;
+        data[1] = j;
+        data[2] = k;
+        MPI_data[0] = rid[j*m+i];
+        MPI_data[1] = rid[k*m+i];
+        MPI_data[2] = rid[j*m+k];
+        MPI_data[3] = tau_id[k*m+i];
+        tid_new = qsched_addtask(&s, task_DSSRFT, task_flag_none, MPI_data,
+                                 sizeof(long long int) * 4, 500);
+        if(k == 0)
+        {
+          memcpy(rids[j*m+i], &A_orig[(data[1]*m+data[0])*K*K], sizeof(double)*K*K);
+        }
+        qsched_addlock(&s, tid_new, rid[j * m + i]);
+        qsched_adduse(&s, tid_new, rid[k * m + i]);
+        qsched_addlock(&s, tid_new, rid[j * m + k]);
+        qsched_adduse(&s, tid_new, tau_id[k*m+i]);
+        // qsched_addunlock(&s, tid[k * m + i], tid_new);
+        qsched_addunlock(&s, tid[j * m + i - 1], tid_new);
+        if (tid[j * m + i] != -1) qsched_addunlock(&s, tid[j * m + i], tid_new);
+
+        tid[j * m + i] = tid_new;
+      }
+    }
+
+  } /* build the tasks. */
+}
+  qsched_run_MPI(&s, nr_threads, runner);
+  // Print off a hello world message
+  printf("Hello world from processor %s, rank = %i, count_ranks = %i\n",
+         processor_name, s.rank, s.count_ranks);
+
+  //TODO Clean up the resource data.
+  qsched_free(&s);
+
+  // Finalize the MPI environment.
+  MPI_Finalize();
+}
+
+/**
+ * @brief Main function.
+ */
+
+int main(int argc, char* argv[]) {
+
+  int c, nr_threads = 1;
+  int M = 4, N = 4, runs = 1, K = 32;
+
+  /* Parse the options */
+  while ((c = getopt(argc, argv, "m:n:k:r:t:")) != -1) switch (c) {
+      case 'm':
+        if (sscanf(optarg, "%d", &M) != 1) error("Error parsing dimension M.");
+        break;
+      case 'n':
+        if (sscanf(optarg, "%d", &N) != 1) error("Error parsing dimension N.");
+        break;
+      case 'k':
+        if (sscanf(optarg, "%d", &K) != 1) error("Error parsing tile size.");
+        break;
+      case 'r':
+        if (sscanf(optarg, "%d", &runs) != 1)
+          error("Error parsing number of runs.");
+        break;
+      case 't':
+        if (sscanf(optarg, "%d", &nr_threads) != 1)
+          error("Error parsing number of threads.");
+        omp_set_num_threads(nr_threads);
+        break;
+      case '?':
+        fprintf(stderr, "Usage: %s [-t nr_threads] [-m M] [-n N] [-k K]\n",
+                argv[0]);
+        fprintf(stderr, "Computes the tiled QR decomposition of an MxN tiled\n"
+                        "matrix using nr_threads threads.\n");
+        exit(EXIT_FAILURE);
+    }
+
+  /* Dump arguments. */
+  message("Computing the tiled QR decomposition of a %ix%i matrix using %i "
+          "threads (%i runs).",
+          K * M, K * N, nr_threads, runs);
+
+  test_qr(M, N, K, nr_threads, runs, NULL);
+}
diff --git a/src/qsched.c b/src/qsched.c
index 0b88d3f8480734ea262a7b0cb486cd931f577e19..a9ba6fb0ccc90f938291435580c1f12199dde38f 100644
--- a/src/qsched.c
+++ b/src/qsched.c
@@ -1621,8 +1621,6 @@ void *temp;
     t->node = ts->rank;
     i++;
 
-    if(i % 1000 == 0)
-        message("Added %i send/recv tasks", i/2);
 
     /* Return the task ID. */
     return id;
@@ -2999,6 +2997,8 @@ for(i = 0; i < node_count; i++)
     }
 
     message("My \"cost\" = %i", count_me);
+
+    tic = getticks();
 
     MPI_Request *reqs;
     reqs = (MPI_Request*) calloc(sizeof(MPI_Request) , node_count * 2);
@@ -3183,8 +3183,13 @@ for(i = 0; i < node_count; i++)
     toc = getticks();
     message("qsched_partition_create_sends took %lli (= %e) ticks\n",
             toc-tic, (float)(toc-tic));
-
-
+    free(edge_lists);
+    free(edge_sizes);
+    free(edge_vwgts);
+    free(edgelist_pos);
+    free(edgelist_new);
+    free(edgelist_vwgt);
+    free(nodelist);